1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
24#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/CallingConv.h"
42#include "llvm/IR/Constants.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalAlias.h"
48#include "llvm/IR/IRBuilder.h"
50#include "llvm/IR/Intrinsics.h"
52#include "llvm/MC/MCAsmInfo.h"
53#include "llvm/MC/MCContext.h"
54#include "llvm/MC/MCExpr.h"
55#include "llvm/MC/MCSymbol.h"
57#include "llvm/Support/Debug.h"
62#include <algorithm>
63#include <bitset>
64#include <cctype>
65#include <numeric>
66using namespace llvm;
67
68#define DEBUG_TYPE "x86-isel"
69
71 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
73 "Sets the preferable loop alignment for experiments (as log2 bytes) "
74 "for innermost loops only. If specified, this option overrides "
75 "alignment set by x86-experimental-pref-loop-alignment."),
77
79 "x86-br-merging-base-cost", cl::init(2),
81 "Sets the cost threshold for when multiple conditionals will be merged "
82 "into one branch versus be split in multiple branches. Merging "
83 "conditionals saves branches at the cost of additional instructions. "
84 "This value sets the instruction cost limit, below which conditionals "
85 "will be merged, and above which conditionals will be split. Set to -1 "
86 "to never merge branches."),
88
90 "x86-br-merging-ccmp-bias", cl::init(6),
91 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
92 "supports conditional compare instructions."),
94
95static cl::opt<bool>
96 WidenShift("x86-widen-shift", cl::init(true),
97 cl::desc("Replace narrow shifts with wider shifts."),
99
101 "x86-br-merging-likely-bias", cl::init(0),
102 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
103 "that all conditionals will be executed. For example for merging "
104 "the conditionals (a == b && c > d), if its known that a == b is "
105 "likely, then it is likely that if the conditionals are split "
106 "both sides will be executed, so it may be desirable to increase "
107 "the instruction cost threshold. Set to -1 to never merge likely "
108 "branches."),
109 cl::Hidden);
110
112 "x86-br-merging-unlikely-bias", cl::init(-1),
113 cl::desc(
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
115 "that all conditionals will be executed. For example for merging "
116 "the conditionals (a == b && c > d), if its known that a == b is "
117 "unlikely, then it is unlikely that if the conditionals are split "
118 "both sides will be executed, so it may be desirable to decrease "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
120 "branches."),
121 cl::Hidden);
122
124 "mul-constant-optimization", cl::init(true),
125 cl::desc("Replace 'mul x, Const' with more effective instructions like "
126 "SHIFT, LEA, etc."),
127 cl::Hidden);
128
130 const X86Subtarget &STI)
131 : TargetLowering(TM), Subtarget(STI) {
132 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
133 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
134
135 // Set up the TargetLowering object.
136
137 // X86 is weird. It always uses i8 for shift amounts and setcc results.
139 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
141
142 // X86 instruction cache is coherent with its data cache so we can use the
143 // default expansion to a no-op.
145
146 // For 64-bit, since we have so many registers, use the ILP scheduler.
147 // For 32-bit, use the register pressure specific scheduling.
148 // For Atom, always use ILP scheduling.
149 if (Subtarget.isAtom())
151 else if (Subtarget.is64Bit())
153 else
155 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
157
158 // Bypass expensive divides and use cheaper ones.
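  // e.g. addBypassSlowDiv(64, 32) lets a 64-bit divide whose operands fit in
  // 32 bits be dispatched to a faster 32-bit DIV via a run-time check.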
159 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
160 if (Subtarget.hasSlowDivide32())
161 addBypassSlowDiv(32, 8);
162 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
163 addBypassSlowDiv(64, 32);
164 }
165
166 // Setup Windows compiler runtime calls.
167 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
168 static const struct {
169 const RTLIB::Libcall Op;
170 const char * const Name;
171 const CallingConv::ID CC;
172 } LibraryCalls[] = {
173 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
174 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
175 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
176 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
177 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
178 };
179
180 for (const auto &LC : LibraryCalls) {
181 setLibcallName(LC.Op, LC.Name);
182 setLibcallCallingConv(LC.Op, LC.CC);
183 }
184 }
185
186 if (Subtarget.canUseCMPXCHG16B())
188 else if (Subtarget.canUseCMPXCHG8B())
190 else
192
193 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
194
196
197 // Set up the register classes.
198 addRegisterClass(MVT::i8, &X86::GR8RegClass);
199 addRegisterClass(MVT::i16, &X86::GR16RegClass);
200 addRegisterClass(MVT::i32, &X86::GR32RegClass);
201 if (Subtarget.is64Bit())
202 addRegisterClass(MVT::i64, &X86::GR64RegClass);
203
204 for (MVT VT : MVT::integer_valuetypes())
206
207 // We don't accept any truncstore of integer registers.
208 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
209 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
211 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
212 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
213 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
214
215 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
216
217 // SETOEQ and SETUNE require checking two conditions.
218 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
221 }
222
223 // Integer absolute.
224 if (Subtarget.canUseCMOV()) {
225 setOperationAction(ISD::ABS , MVT::i16 , Custom);
226 setOperationAction(ISD::ABS , MVT::i32 , Custom);
227 if (Subtarget.is64Bit())
228 setOperationAction(ISD::ABS , MVT::i64 , Custom);
229 }
230
231 // Absolute difference.
232 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
233 setOperationAction(Op , MVT::i8 , Custom);
234 setOperationAction(Op , MVT::i16 , Custom);
235 setOperationAction(Op , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(Op , MVT::i64 , Custom);
238 }
239
240 // Signed saturation subtraction.
244 if (Subtarget.is64Bit())
246
247 // Funnel shifts.
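  // i32/i64 FSHL/FSHR map onto the x86 SHLD/SHRD double-shift instructions,
  // which is why they can be Legal below unless SHLD is slow on this subtarget.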
248 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
249 // For slow shld targets we only lower for code size.
250 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
251
252 setOperationAction(ShiftOp , MVT::i8 , Custom);
253 setOperationAction(ShiftOp , MVT::i16 , Custom);
254 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
255 if (Subtarget.is64Bit())
256 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
257 }
258
259 if (!Subtarget.useSoftFloat()) {
260 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
261 // operation.
266 // We have an algorithm for SSE2, and we turn this into a 64-bit
267 // FILD or VCVTUSI2SS/SD for other targets.
270 // We have an algorithm for SSE2->double, and we turn this into a
271 // 64-bit FILD followed by conditional FADD for other targets.
274
275 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
276 // this operation.
279 // SSE has no i16 to fp conversion, only i32. We promote in the handler
280 // to allow f80 to use i16 and f64 to use i16 with sse1 only
283 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
287 // are Legal, f80 is custom lowered.
290
291 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
292 // this operation.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
301 // are Legal, f80 is custom lowered.
304
305 // Handle FP_TO_UINT by promoting the destination to a larger signed
306 // conversion.
308 // FIXME: This doesn't generate invalid exception when it should. PR44019.
311 // FIXME: This doesn't generate invalid exception when it should. PR44019.
317
322
323 if (!Subtarget.is64Bit()) {
326 }
327 }
328
329 if (Subtarget.hasSSE2()) {
330 // Custom lowering for saturating float to int conversions.
331 // We handle promotion to larger result types manually.
332 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
335 }
337 if (Subtarget.is64Bit()) {
341 }
342 }
343 if (Subtarget.hasAVX10_2()) {
346 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
347 MVT::v4i64}) {
350 }
351 if (Subtarget.hasAVX10_2_512()) {
354 }
355 if (Subtarget.is64Bit()) {
358 }
359 }
360
361 // Handle address space casts between mixed sized pointers.
364
365 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
366 if (!Subtarget.hasSSE2()) {
372 if (Subtarget.is64Bit()) {
374 // Without SSE, i64->f64 goes through memory.
376 }
377 } else if (!Subtarget.is64Bit())
379
380 // Scalar integer divide and remainder are lowered to use operations that
381 // produce two results, to match the available instructions. This exposes
382 // the two-result form to trivial CSE, which is able to combine x/y and x%y
383 // into a single instruction.
384 //
385 // Scalar integer multiply-high is also lowered to use two-result
386 // operations, to match the available instructions. However, plain multiply
387 // (low) operations are left as Legal, as there are single-result
388 // instructions for this in x86. Using the two-result multiply instructions
389 // when both high and low results are needed must be arranged by dagcombine.
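  // e.g. a single x86 DIV/IDIV yields both results at once (quotient in EAX and
  // remainder in EDX for the 32-bit case), so x/y and x%y can share one
  // instruction.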
390 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
397 }
398
399 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
401 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
402 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
405 }
406 if (Subtarget.is64Bit())
411
412 setOperationAction(ISD::FREM , MVT::f32 , Expand);
413 setOperationAction(ISD::FREM , MVT::f64 , Expand);
414 setOperationAction(ISD::FREM , MVT::f80 , Expand);
415 setOperationAction(ISD::FREM , MVT::f128 , Expand);
416
417 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
423 }
424
425 // Promote the i8 variants and force them on up to i32 which has a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
429 // Promote i16 too. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
430 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
431 // promote that too.
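  // (CPUs without BMI ignore the REP prefix and execute a plain BSF, so the
  // TZCNT encoding is backward compatible.)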
432 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
434
435 if (!Subtarget.hasBMI()) {
436 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
438 if (Subtarget.is64Bit()) {
439 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
441 }
442 }
443
444 if (Subtarget.hasLZCNT()) {
445 // When promoting the i8 variants, force them to i32 for a shorter
446 // encoding.
447 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
449 } else {
450 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
451 if (VT == MVT::i64 && !Subtarget.is64Bit())
452 continue;
455 }
456 }
457
460 // Special handling for half-precision floating point conversions.
461 // If we don't have F16C support, then lower half float conversions
462 // into library calls.
464 Op, MVT::f32,
465 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
466 // There's never any support for operations beyond MVT::f32.
467 setOperationAction(Op, MVT::f64, Expand);
468 setOperationAction(Op, MVT::f80, Expand);
469 setOperationAction(Op, MVT::f128, Expand);
470 }
471
472 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
475 }
476
477 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
478 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
479 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
480 setTruncStoreAction(VT, MVT::f16, Expand);
481 setTruncStoreAction(VT, MVT::bf16, Expand);
482
485 }
486
490 if (Subtarget.is64Bit())
492 if (Subtarget.hasPOPCNT()) {
493 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
494 // popcntw is longer to encode than popcntl and also has a false dependency
495 // on the dest that popcntl hasn't had since Cannon Lake.
496 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
497 } else {
502 }
503
505
506 if (!Subtarget.hasMOVBE())
508
509 // X86 wants to expand cmov itself.
510 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
515 }
516 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
517 if (VT == MVT::i64 && !Subtarget.is64Bit())
518 continue;
521 }
522
523 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
526
528 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
529 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
533 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
534 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
535
536 // Darwin ABI issue.
537 for (auto VT : { MVT::i32, MVT::i64 }) {
538 if (VT == MVT::i64 && !Subtarget.is64Bit())
539 continue;
546 }
547
548 // 64-bit shl, sra, srl (iff 32-bit x86)
549 for (auto VT : { MVT::i32, MVT::i64 }) {
550 if (VT == MVT::i64 && !Subtarget.is64Bit())
551 continue;
555 }
556
557 if (Subtarget.hasSSEPrefetch())
559
561
562 // Expand certain atomics
563 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
571 }
572
573 if (!Subtarget.is64Bit())
575
576 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
577 // All CPUs supporting AVX will atomically load/store aligned 128-bit
578 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
581 }
582
583 if (Subtarget.canUseCMPXCHG16B())
585
586 // FIXME - use subtarget debug flags
587 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
588 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
589 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
591 }
592
595
598
599 setOperationAction(ISD::TRAP, MVT::Other, Legal);
601 if (Subtarget.isTargetPS())
603 else
605
606 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
608 setOperationAction(ISD::VAEND , MVT::Other, Expand);
609 bool Is64Bit = Subtarget.is64Bit();
610 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
611 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
612
615
617
618 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
621
623
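  // Helper that applies one legalization action to the common set of FP
  // operations on a half-precision type; used below with Promote for scalar
  // f16 and with Expand for the vector f16/bf16 types.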
624 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
625 setOperationAction(ISD::FABS, VT, Action);
626 setOperationAction(ISD::FNEG, VT, Action);
628 setOperationAction(ISD::FREM, VT, Action);
629 setOperationAction(ISD::FMA, VT, Action);
630 setOperationAction(ISD::FMINNUM, VT, Action);
631 setOperationAction(ISD::FMAXNUM, VT, Action);
636 setOperationAction(ISD::FSIN, VT, Action);
637 setOperationAction(ISD::FCOS, VT, Action);
638 setOperationAction(ISD::FSINCOS, VT, Action);
639 setOperationAction(ISD::FTAN, VT, Action);
640 setOperationAction(ISD::FSQRT, VT, Action);
641 setOperationAction(ISD::FPOW, VT, Action);
642 setOperationAction(ISD::FPOWI, VT, Action);
643 setOperationAction(ISD::FLOG, VT, Action);
644 setOperationAction(ISD::FLOG2, VT, Action);
645 setOperationAction(ISD::FLOG10, VT, Action);
646 setOperationAction(ISD::FEXP, VT, Action);
647 setOperationAction(ISD::FEXP2, VT, Action);
648 setOperationAction(ISD::FEXP10, VT, Action);
649 setOperationAction(ISD::FCEIL, VT, Action);
650 setOperationAction(ISD::FFLOOR, VT, Action);
652 setOperationAction(ISD::FRINT, VT, Action);
653 setOperationAction(ISD::BR_CC, VT, Action);
654 setOperationAction(ISD::SETCC, VT, Action);
657 setOperationAction(ISD::FROUND, VT, Action);
659 setOperationAction(ISD::FTRUNC, VT, Action);
660 setOperationAction(ISD::FLDEXP, VT, Action);
661 };
662
663 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
664 // f16, f32 and f64 use SSE.
665 // Set up the FP register classes.
666 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
667 : &X86::FR16RegClass);
668 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
669 : &X86::FR32RegClass);
670 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
671 : &X86::FR64RegClass);
672
673 // Disable f32->f64 extload as we can only generate this in one instruction
674 // under optsize. So it's easier to pattern match (fpext (load)) for that
675 // case instead of needing to emit 2 instructions for extload in the
676 // non-optsize case.
677 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
678
679 for (auto VT : { MVT::f32, MVT::f64 }) {
680 // Use ANDPD to simulate FABS.
682
683 // Use XORP to simulate FNEG.
685
686 // Use ANDPD and ORPD to simulate FCOPYSIGN.
688
689 // These might be better off as horizontal vector ops.
692
693 // We don't support sin/cos/fmod
697 }
698
699 // Half type will be promoted by default.
700 setF16Action(MVT::f16, Promote);
708
739
740 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
741 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
742
743 // Lower this to MOVMSK plus an AND.
746
747 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
748 (UseX87 || Is64Bit)) {
749 // Use SSE for f32, x87 for f64.
750 // Set up the FP register classes.
751 addRegisterClass(MVT::f32, &X86::FR32RegClass);
752 if (UseX87)
753 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
754
755 // Use ANDPS to simulate FABS.
757
758 // Use XORP to simulate FNEG.
760
761 if (UseX87)
763
764 // Use ANDPS and ORPS to simulate FCOPYSIGN.
765 if (UseX87)
768
769 // We don't support sin/cos/fmod
773
774 if (UseX87) {
775 // Always expand sin/cos functions even though x87 has an instruction.
779 }
780 } else if (UseX87) {
781 // f32 and f64 in x87.
782 // Set up the FP register classes.
783 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
784 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
785
786 for (auto VT : { MVT::f32, MVT::f64 }) {
789
790 // Always expand sin/cos functions even though x87 has an instruction.
794 }
795 }
796
797 // Expand FP32 immediates into loads from the stack, save special cases.
798 if (isTypeLegal(MVT::f32)) {
799 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
800 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
801 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
802 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
803 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
804 } else // SSE immediates.
805 addLegalFPImmediate(APFloat(+0.0f)); // xorps
806 }
807 // Expand FP64 immediates into loads from the stack, save special cases.
808 if (isTypeLegal(MVT::f64)) {
809 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
810 addLegalFPImmediate(APFloat(+0.0)); // FLD0
811 addLegalFPImmediate(APFloat(+1.0)); // FLD1
812 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
813 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
814 } else // SSE immediates.
815 addLegalFPImmediate(APFloat(+0.0)); // xorpd
816 }
817 // Support fp16 0 immediate.
818 if (isTypeLegal(MVT::f16))
819 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
820
821 // Handle constrained floating-point operations of scalar.
834
835 // We don't support FMA.
838
839 // f80 always uses X87.
840 if (UseX87) {
841 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
844 {
846 addLegalFPImmediate(TmpFlt); // FLD0
847 TmpFlt.changeSign();
848 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
849
850 bool ignored;
851 APFloat TmpFlt2(+1.0);
853 &ignored);
854 addLegalFPImmediate(TmpFlt2); // FLD1
855 TmpFlt2.changeSign();
856 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
857 }
858
859 // Always expand sin/cos functions even though x87 has an instruction.
860 // clang-format off
872 // clang-format on
873
885
886 // Handle constrained floating-point operations of scalar.
892 if (isTypeLegal(MVT::f16)) {
895 } else {
897 }
898 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
899 // as Custom.
901 }
902
903 // f128 uses xmm registers, but most operations require libcalls.
904 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
905 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
906 : &X86::VR128RegClass);
907
908 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
909
920
924
925 // clang-format off
933 // clang-format on
934 // No STRICT_FSINCOS
937
940 // We need to custom handle any FP_ROUND with an f128 input, but
941 // LegalizeDAG uses the result type to know when to run a custom handler.
942 // So we have to list all legal floating point result types here.
943 if (isTypeLegal(MVT::f32)) {
946 }
947 if (isTypeLegal(MVT::f64)) {
950 }
951 if (isTypeLegal(MVT::f80)) {
955 }
956
958
959 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
960 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
961 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
962 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
963 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
964 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
965 }
966
967 // Always use a library call for pow.
968 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
969 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
970 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
971 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
972
981
982 // Some FP actions are always expanded for vector types.
983 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
984 MVT::v4f32, MVT::v8f32, MVT::v16f32,
985 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
986 // clang-format off
1000 // clang-format on
1001 }
1002
1003 // First set operation action for all vector types to either promote
1004 // (for widening) or expand (for scalarization). Then we will selectively
1005 // turn on ones that can be effectively codegen'd.
1045 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1046 setTruncStoreAction(InnerVT, VT, Expand);
1047
1048 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1049 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1050
1051 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1052 // types, we have to deal with them whether we ask for Expansion or not.
1053 // Setting Expand causes its own optimisation problems though, so leave
1054 // them legal.
1055 if (VT.getVectorElementType() == MVT::i1)
1056 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1057
1058 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1059 // split/scalarized right now.
1060 if (VT.getVectorElementType() == MVT::f16 ||
1061 VT.getVectorElementType() == MVT::bf16)
1062 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1063 }
1064 }
1065
1066 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1067 // with -msoft-float, disable use of MMX as well.
1068 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1069 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1070 // No operations on x86mmx supported, everything uses intrinsics.
1071 }
1072
1073 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1074 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1075 : &X86::VR128RegClass);
1076
1081
1082 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1083 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1091
1092 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1093 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1095
1101 }
1102
1103 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1104 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1105 : &X86::VR128RegClass);
1106
1107 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1108 // registers cannot be used even for integer operations.
1109 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1110 : &X86::VR128RegClass);
1111 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1112 : &X86::VR128RegClass);
1113 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1114 : &X86::VR128RegClass);
1115 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1116 : &X86::VR128RegClass);
1117 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1118 : &X86::VR128RegClass);
1119
1120 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1125 }
1126
1127 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1128 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1133 }
1134
1135 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1136 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1137 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1138
1139 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1140 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1141 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1142 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1143 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1144 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1145 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1146 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1147 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1148 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1151
1152 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1153 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1154 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1155
1156 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1158 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1160
1161 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1162
1163 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1164 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1165 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1166 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1167 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1168 }
1169
1180
1185
1186 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1192
1193 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1194 // setcc all the way to isel and prefer SETGT in some isel patterns.
1197 }
1198
1199 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1200 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1205
1206 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1212 }
1213
1214 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1218
1219 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1220 continue;
1221
1224 }
1225 setF16Action(MVT::v8f16, Expand);
1226 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1227 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1228 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1229 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1230 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1231 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1233
1234 // Custom lower v2i64 and v2f64 selects.
1241
1248
1249 // Custom legalize these to avoid over promotion or custom promotion.
1250 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1255 }
1256
1261
1264
1267
1268 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1273
1278
1279 // We want to legalize this to an f64 load rather than an i64 load on
1280 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1281 // store.
1282 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1283 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1284 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1285 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1286 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1288
1289 // Add 32-bit vector stores to help vectorization opportunities.
1290 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1292
1296 if (!Subtarget.hasAVX512())
1298
1302
1304
1321
1322 // In the customized shift lowering, the legal v4i32/v2i64 cases
1323 // in AVX2 will be recognized.
1324 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1328 if (VT == MVT::v2i64) continue;
1333 }
1334
1340 }
1341
1342 if (Subtarget.hasGFNI()) {
1347 }
1348
1349 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1350 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1351 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1352 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1353
1354 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1357 }
1358
1359 // These might be better off as horizontal vector ops.
1364 }
1365
1366 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1367 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1370 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1374 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1380
1382 }
1383
1384 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1385 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1386 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1387 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1388 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1389 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1390 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1391 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1392
1396
1397 // FIXME: Do we need to handle scalar-to-vector here?
1398 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1399 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1400
1401 // We directly match byte blends in the backend as they match the VSELECT
1402 // condition form.
1404
1405 // SSE41 brings specific instructions for doing vector sign extend even in
1406 // cases where we don't have SRA.
1407 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1410 }
1411
1412 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1413 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1414 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1415 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1416 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1417 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1418 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1419 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1420 }
1421
1422 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1423 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1424 // do the pre and post work in the vector domain.
1427 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1428 // so that DAG combine doesn't try to turn it into uint_to_fp.
1431 }
1432 }
1433
1434 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1436 }
1437
1438 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1439 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1440 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1443 }
1444
1445 // XOP can efficiently perform BITREVERSE with VPPERM.
1446 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1448 }
1449
1450 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1451 bool HasInt256 = Subtarget.hasInt256();
1452
1453 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1454 : &X86::VR256RegClass);
1455 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1456 : &X86::VR256RegClass);
1457 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1458 : &X86::VR256RegClass);
1459 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1460 : &X86::VR256RegClass);
1461 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1462 : &X86::VR256RegClass);
1463 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1464 : &X86::VR256RegClass);
1465 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1466 : &X86::VR256RegClass);
1467
1468 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1481
1483
1487
1493 }
1494
1495 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1496 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1497
1498 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1499 // even though v8i16 is a legal type.
1500 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1501 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1502 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1503 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1507
1514
1526
1527 if (!Subtarget.hasAVX512())
1529
1530 // In the customized shift lowering, the legal v8i32/v4i64 cases
1531 // in AVX2 will be recognized.
1532 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1538 if (VT == MVT::v4i64) continue;
1543 }
1544
1545 // These types need custom splitting if their input is a 128-bit vector.
1550
1554 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1555 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1558
1559 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1563 }
1564
1569
1570 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1575
1576 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1577 // setcc all the way to isel and prefer SETGT in some isel patterns.
1580 }
1581
1582 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1583 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1588
1589 if (Subtarget.hasAnyFMA()) {
1590 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1591 MVT::v2f64, MVT::v4f64 }) {
1594 }
1595 }
1596
1597 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1598 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1599 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1600 }
1601
1602 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1603 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1604 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1605 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1606
1607 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1608 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1609 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1610 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1611 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1612 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1613 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1615
1616 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1617 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1618
1619 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1620 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1621 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1622 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1623 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1624
1625 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1630 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1631 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1632 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1637
1638 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1639 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1640 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1641 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1642 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1643 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1644 }
1645
1646 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1649 }
1650
1651 if (HasInt256) {
1652 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1653 // when we have a 256bit-wide blend with immediate.
1656
1657 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1658 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1659 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1660 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1661 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1662 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1663 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1664 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1665 }
1666 }
1667
1668 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1669 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1670 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1672 }
1673
1674 // Extract subvector is special because the value type
1675 // (result) is 128-bit but the source is 256-bit wide.
1676 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1677 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1679 }
1680
1681 // Custom lower several nodes for 256-bit types.
1682 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1683 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1693 }
1694 setF16Action(MVT::v16f16, Expand);
1695 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1696 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1698 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1699 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1700 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1701 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1702
1703 if (HasInt256) {
1705
1706 // Custom legalize 2x32 to get a little better code.
1709
1710 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1711 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1713 }
1714 }
1715
1716 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1717 Subtarget.hasF16C()) {
1718 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1721 }
1722 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1725 }
1726 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1727 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1728 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1729 }
1730 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1731 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1732 }
1733
1734 // This block controls legalization of the mask vector sizes that are
1735 // available with AVX512. 512-bit vectors are in a separate block controlled
1736 // by useAVX512Regs.
1737 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1738 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1739 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1740 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1741 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1742 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1743
1747
1748 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1749 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1750 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1751 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1752 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1753 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1754 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1755 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1763
1764 // There is no byte sized k-register load or store without AVX512DQ.
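  // (KMOVB is an AVX512DQ instruction; with only AVX512F the narrowest
  // k-register move is the 16-bit KMOVW.)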
1765 if (!Subtarget.hasDQI()) {
1766 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1767 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1768 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1769 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1770
1775 }
1776
1777 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1778 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1782 }
1783
1784 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1786
1787 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1791
1798 }
1799
1800 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1802 }
1803 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1804 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1807 }
1808 }
1809
1810 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1811 // elements. 512-bits can be disabled based on prefer-vector-width and
1812 // required-vector-width function attributes.
1813 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1814 bool HasBWI = Subtarget.hasBWI();
1815
1816 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1817 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1818 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1819 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1820 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1821 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1822 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1823
1824 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1825 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1826 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1827 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1828 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1829 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1830 if (HasBWI)
1831 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1832 }
1833
1834 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1845 }
1846 setOperationAction(ISD::LRINT, MVT::v16f32,
1847 Subtarget.hasDQI() ? Legal : Custom);
1848 setOperationAction(ISD::LRINT, MVT::v8f64,
1849 Subtarget.hasDQI() ? Legal : Custom);
1850 if (Subtarget.hasDQI())
1851 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1852
1853 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1858 }
1859
1860 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1865 }
1866
1873
1885
1886 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1887 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1888 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1889 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1890 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1891 if (HasBWI)
1892 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1893
1894 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1895 // to 512-bit rather than use the AVX2 instructions so that we can use
1896 // k-masks.
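  // i.e. a 128/256-bit masked load/store is widened to 512 bits so its mask
  // can live in a k-register rather than a vector register.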
1897 if (!Subtarget.hasVLX()) {
1898 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1899 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1902 }
1903 }
1904
1906 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1907 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1917
1918 if (HasBWI) {
1919 // Extends from v64i1 masks to 512-bit vectors.
1923 }
1924
1925 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1938
1940 }
1941
1942 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1945 }
1946
1947 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1948 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1949 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1953 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1954 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1955 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1956
1957 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1958 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1959 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1960 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1961 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1962 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1963 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1964 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1965
1966 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1967 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1968
1969 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1979
1980 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1981 // setcc all the way to isel and prefer SETGT in some isel patterns.
1984 }
1985
1986 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1987 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1992
1993 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2000 }
2001
2002 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2003 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2004 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2006 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2007 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2008 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2009 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2014 }
2015
2016 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2017 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2018 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2019 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2020 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2021 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2022
2023 if (Subtarget.hasDQI()) {
2027 setOperationAction(Opc, MVT::v8i64, Custom);
2028 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2029 }
2030
2031 if (Subtarget.hasCDI()) {
2032 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2033 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2035 }
2036 } // Subtarget.hasCDI()
2037
2038 if (Subtarget.hasVPOPCNTDQ()) {
2039 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2041 }
2042
2043 // Extract subvector is special because the value type
2044 // (result) is 256-bit but the source is 512-bit wide.
2045 // 128-bit was made Legal under AVX1.
2046 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2047 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2049
2050 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2051 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2061 }
2062 setF16Action(MVT::v32f16, Expand);
2067 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2068 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2069 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2070
2071 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2076 }
2077 if (HasBWI) {
2078 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2081 }
2082 } else {
2083 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2084 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2085 }
2086
2087 if (Subtarget.hasVBMI2()) {
2088 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2091 }
2092
2093 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2094 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2095 }
2096
2097 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2098 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2100 }// useAVX512Regs
2101
2102 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2103 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2104 MVT::v4i64}) {
2107 }
2108 }
2109
2110 // This block controls legalization for operations that don't have
2111 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2112 // narrower widths.
2113 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2114 // These operations are handled on non-VLX by artificially widening in
2115 // isel patterns.
2116
2120
2121 if (Subtarget.hasDQI()) {
2122 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2123 // v2f32 UINT_TO_FP is already custom under SSE2.
2126 "Unexpected operation action!");
2127 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2132 }
2133
2134 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2140 }
2141
2142 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2145 }
2146
2147 // Custom legalize 2x32 to get a little better code.
2150
2151 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2152 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2154
2155 if (Subtarget.hasDQI()) {
2159 setOperationAction(Opc, MVT::v2i64, Custom);
2160 setOperationAction(Opc, MVT::v4i64, Custom);
2161 }
2162 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2163 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2164 }
2165
2166 if (Subtarget.hasCDI()) {
2167 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2169 }
2170 } // Subtarget.hasCDI()
2171
2172 if (Subtarget.hasVPOPCNTDQ()) {
2173 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2175 }
2176
2177 // We can try to convert vectors to different sizes to leverage legal
2178 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2179 // then specialize to Legal below.
2180 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2181 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2182 MVT::v16i16, MVT::v8i8})
2184
2185 // Legal vpcompress depends on various AVX512 extensions.
2186 // Legal in AVX512F
2187 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2189
2190 // Legal in AVX512F + AVX512VL
2191 if (Subtarget.hasVLX())
2192 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2193 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2195
2196 // Legal in AVX512F + AVX512VBMI2
2197 if (Subtarget.hasVBMI2())
2198 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2200
2201 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2202 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2203 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2205 }
2206
2207 // This block controls legalization of v32i1/v64i1, which are available with
2208 // AVX512BW.
2209 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2210 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2211 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2212
2213 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2224 }
2225
2226 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2228
2229 // Extends from v32i1 masks to 256-bit vectors.
2233
2234 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2235 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2236 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2237 }
2238
2239 // These operations are handled on non-VLX by artificially widening in
2240 // isel patterns.
2241 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2242
2243 if (Subtarget.hasBITALG()) {
2244 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2246 }
2247 }
2248
2249 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2250 auto setGroup = [&] (MVT VT) {
2261
2274
2276
2279
2285
2291
2295 };
2296
2297 // AVX512_FP16 scalar operations
2298 setGroup(MVT::f16);
2314
2317
2318 if (Subtarget.useAVX512Regs()) {
2319 setGroup(MVT::v32f16);
2325 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2332
2337 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2339 MVT::v32i16);
2340 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2342 MVT::v32i16);
2343 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2345 MVT::v32i16);
2346 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2348 MVT::v32i16);
2349
2353
2354 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2355 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2356
2361 }
2362
2363 if (Subtarget.hasVLX()) {
2364 setGroup(MVT::v8f16);
2365 setGroup(MVT::v16f16);
2366
2377
2388
2389 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2392
2396
2397 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2398 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2399 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2400 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2401
2402 // Need to custom widen these to prevent scalarization.
2403 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2404 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2405
2410
2415 }
2416 }
2417
2418 if (!Subtarget.useSoftFloat() &&
2419 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2420 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2421 : &X86::VR128RegClass);
2422 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2423 : &X86::VR256RegClass);
2424 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2425 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2426 // Set the operation action Custom to do the customization later.
2429 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2430 setF16Action(VT, Expand);
2431 if (!Subtarget.hasBF16())
2437 }
2438 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2439 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2440 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2441 }
2442 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2443 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2445 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2446 }
2447
2448 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2449 Subtarget.useAVX512Regs()) {
2450 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2451 setF16Action(MVT::v32bf16, Expand);
2452 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2453 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2454 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2456 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2460 }
2461
2462 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2463 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2475 }
2476 if (Subtarget.hasAVX10_2_512()) {
2477 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2478 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2479 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2480 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2481 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2482 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2483 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2484 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2485 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2488 }
2489 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2492 }
2493 }
2494
2495 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2496 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2497 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2498 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2499 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2500 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2501
2502 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2503 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2504 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2505 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2506 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2507
2508 if (Subtarget.hasBWI()) {
2509 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2510 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2511 }
2512
2513 if (Subtarget.hasFP16()) {
2514 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2523 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2532 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2537 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2542 }
2543 }
2544
2545 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2546 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2547 }
2548
2549 // We want to custom lower some of our intrinsics.
2553 if (!Subtarget.is64Bit()) {
2555 }
2556
2557 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2558 // handle type legalization for these operations here.
2559 //
2560 // FIXME: We really should do custom legalization for addition and
2561 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2562 // than generic legalization for 64-bit multiplication-with-overflow, though.
2563 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2564 if (VT == MVT::i64 && !Subtarget.is64Bit())
2565 continue;
2566 // Add/Sub/Mul with overflow operations are custom lowered.
2573
2574 // Support carry in as value rather than glue.
2580 }
2581
2582 // Combine sin / cos into _sincos_stret if it is available.
2583 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2584 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2587 }
2588
2589 if (Subtarget.isTargetWin64()) {
2590 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2591 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2592 setOperationAction(ISD::SREM, MVT::i128, Custom);
2593 setOperationAction(ISD::UREM, MVT::i128, Custom);
2602 }
2603
2604 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2605 // is. We should promote the value to 64-bits to solve this.
2606 // This is what the CRT headers do - `fmodf` is an inline header
2607 // function casting to f64 and calling `fmod`.
2608 if (Subtarget.is32Bit() &&
2609 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2610 // clang-format off
2611 for (ISD::NodeType Op :
2629 if (isOperationExpand(Op, MVT::f32))
2630 setOperationAction(Op, MVT::f32, Promote);
2631 // clang-format on
2632
2633 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2634 // it, but it's just a wrapper around ldexp.
2635 if (Subtarget.isOSWindows()) {
2637 if (isOperationExpand(Op, MVT::f32))
2638 setOperationAction(Op, MVT::f32, Promote);
2639 }
2640
2641 // We have target-specific dag combine patterns for the following nodes:
2652 ISD::SHL,
2653 ISD::SRA,
2654 ISD::SRL,
2655 ISD::OR,
2656 ISD::AND,
2662 ISD::ADD,
2663 ISD::FADD,
2664 ISD::FSUB,
2665 ISD::FNEG,
2666 ISD::FMA,
2670 ISD::SUB,
2671 ISD::LOAD,
2672 ISD::LRINT,
2674 ISD::MLOAD,
2675 ISD::STORE,
2691 ISD::SETCC,
2692 ISD::MUL,
2693 ISD::XOR,
2704
2706
2707 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2709 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2711 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2713
2714 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2715 // that needs to be benchmarked and balanced with the potential use of vector
2716 // load/store types (PR33329, PR33914).
2719
2720 // Default loop alignment, which can be overridden by -align-loops.
2722
2723 // An out-of-order CPU can speculatively execute past a predictable branch,
2724 // but a conditional move could be stalled by an expensive earlier operation.
2725 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2726 EnableExtLdPromotion = true;
2728
2730
2731 // Default to having -disable-strictnode-mutation on
2732 IsStrictFPEnabled = true;
2733}
2734
2735// This has so far only been implemented for 64-bit MachO.
2737 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2738}
2739
2741 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2742 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2743}
2744
2746 const SDLoc &DL) const {
2747 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2748 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2749 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2750 return SDValue(Node, 0);
2751}
2752
2755 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2756 !Subtarget.hasBWI())
2757 return TypeSplitVector;
2758
2759 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2760 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2761 return TypeSplitVector;
2762
2763 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2764 VT.getVectorElementType() != MVT::i1)
2765 return TypeWidenVector;
2766
2768}
2769
2770FastISel *
2772 const TargetLibraryInfo *libInfo) const {
2773 return X86::createFastISel(funcInfo, libInfo);
2774}
2775
2776//===----------------------------------------------------------------------===//
2777// Other Lowering Hooks
2778//===----------------------------------------------------------------------===//
2779
2781 bool AssumeSingleUse) {
2782 if (!AssumeSingleUse && !Op.hasOneUse())
2783 return false;
2784 if (!ISD::isNormalLoad(Op.getNode()))
2785 return false;
2786
2787 // If this is an unaligned vector, make sure the target supports folding it.
2788 auto *Ld = cast<LoadSDNode>(Op.getNode());
2789 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2790 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2791 return false;
2792
2793 // TODO: If this is a non-temporal load and the target has an instruction
2794 // for it, it should not be folded. See "useNonTemporalLoad()".
2795
2796 return true;
2797}
2798
2800 const X86Subtarget &Subtarget,
2801 bool AssumeSingleUse) {
2802 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2803 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2804 return false;
2805
2806 // We cannot replace a wide volatile load with a broadcast-from-memory,
2807 // because that would narrow the load, which isn't legal for volatiles.
2808 auto *Ld = cast<LoadSDNode>(Op.getNode());
2809 return !Ld->isVolatile() ||
2810 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2811}
2812
2814 if (!Op.hasOneUse())
2815 return false;
2816 // Peek through (oneuse) bitcast users
2817 SDNode *User = *Op->user_begin();
2818 while (User->getOpcode() == ISD::BITCAST) {
2819 if (!User->hasOneUse())
2820 return false;
2821 User = *User->user_begin();
2822 }
2823 return ISD::isNormalStore(User);
2824}
2825
2827 if (Op.hasOneUse()) {
2828 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2829 return (ISD::ZERO_EXTEND == Opcode);
2830 }
2831 return false;
2832}
2833
2834static bool isLogicOp(unsigned Opcode) {
2835 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2836 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2837}
2838
2839static bool isTargetShuffle(unsigned Opcode) {
2840 switch(Opcode) {
2841 default: return false;
2842 case X86ISD::BLENDI:
2843 case X86ISD::PSHUFB:
2844 case X86ISD::PSHUFD:
2845 case X86ISD::PSHUFHW:
2846 case X86ISD::PSHUFLW:
2847 case X86ISD::SHUFP:
2848 case X86ISD::INSERTPS:
2849 case X86ISD::EXTRQI:
2850 case X86ISD::INSERTQI:
2851 case X86ISD::VALIGN:
2852 case X86ISD::PALIGNR:
2853 case X86ISD::VSHLDQ:
2854 case X86ISD::VSRLDQ:
2855 case X86ISD::MOVLHPS:
2856 case X86ISD::MOVHLPS:
2857 case X86ISD::MOVSHDUP:
2858 case X86ISD::MOVSLDUP:
2859 case X86ISD::MOVDDUP:
2860 case X86ISD::MOVSS:
2861 case X86ISD::MOVSD:
2862 case X86ISD::MOVSH:
2863 case X86ISD::UNPCKL:
2864 case X86ISD::UNPCKH:
2865 case X86ISD::VBROADCAST:
2866 case X86ISD::VPERMILPI:
2867 case X86ISD::VPERMILPV:
2868 case X86ISD::VPERM2X128:
2869 case X86ISD::SHUF128:
2870 case X86ISD::VPERMIL2:
2871 case X86ISD::VPERMI:
2872 case X86ISD::VPPERM:
2873 case X86ISD::VPERMV:
2874 case X86ISD::VPERMV3:
2875 case X86ISD::VZEXT_MOVL:
2876 return true;
2877 }
2878}
2879
2880static bool isTargetShuffleVariableMask(unsigned Opcode) {
2881 switch (Opcode) {
2882 default: return false;
2883 // Target Shuffles.
2884 case X86ISD::PSHUFB:
2885 case X86ISD::VPERMILPV:
2886 case X86ISD::VPERMIL2:
2887 case X86ISD::VPPERM:
2888 case X86ISD::VPERMV:
2889 case X86ISD::VPERMV3:
2890 return true;
2891 // 'Faux' Target Shuffles.
2892 case ISD::OR:
2893 case ISD::AND:
2894 case X86ISD::ANDNP:
2895 return true;
2896 }
2897}
2898
2901 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2903 int ReturnAddrIndex = FuncInfo->getRAIndex();
2904
2905 if (ReturnAddrIndex == 0) {
2906 // Set up a frame object for the return address.
2907 unsigned SlotSize = RegInfo->getSlotSize();
2908 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2909 -(int64_t)SlotSize,
2910 false);
2911 FuncInfo->setRAIndex(ReturnAddrIndex);
2912 }
2913
2914 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2915}
2916
2918 bool HasSymbolicDisplacement) {
2919 // Offset should fit into a 32-bit immediate field.
2920 if (!isInt<32>(Offset))
2921 return false;
2922
2923 // If we don't have a symbolic displacement - we don't have any extra
2924 // restrictions.
2925 if (!HasSymbolicDisplacement)
2926 return true;
2927
2928 // We can fold large offsets in the large code model because we always use
2929 // 64-bit offsets.
2930 if (CM == CodeModel::Large)
2931 return true;
2932
2933 // For the kernel code model we know that all objects reside in the negative
2934 // half of the 32-bit address space. We must not accept negative offsets, since
2935 // they may be just out of range, but we may accept pretty large positive ones.
2936 if (CM == CodeModel::Kernel)
2937 return Offset >= 0;
2938
2939 // For other non-large code models we assume that the last small object is 16MB
2940 // below the end of the 31-bit boundary. We may also accept pretty large negative
2941 // constants, knowing that all objects are in the positive half of the address
2942 // space.
2943 return Offset < 16 * 1024 * 1024;
2944}
2945
2946/// Return true if the condition is a signed comparison operation.
2947static bool isX86CCSigned(unsigned X86CC) {
2948 switch (X86CC) {
2949 default:
2950 llvm_unreachable("Invalid integer condition!");
2951 case X86::COND_E:
2952 case X86::COND_NE:
2953 case X86::COND_B:
2954 case X86::COND_A:
2955 case X86::COND_BE:
2956 case X86::COND_AE:
2957 return false;
2958 case X86::COND_G:
2959 case X86::COND_GE:
2960 case X86::COND_L:
2961 case X86::COND_LE:
2962 return true;
2963 }
2964}
2965
2967 switch (SetCCOpcode) {
2968 // clang-format off
2969 default: llvm_unreachable("Invalid integer condition!");
2970 case ISD::SETEQ: return X86::COND_E;
2971 case ISD::SETGT: return X86::COND_G;
2972 case ISD::SETGE: return X86::COND_GE;
2973 case ISD::SETLT: return X86::COND_L;
2974 case ISD::SETLE: return X86::COND_LE;
2975 case ISD::SETNE: return X86::COND_NE;
2976 case ISD::SETULT: return X86::COND_B;
2977 case ISD::SETUGT: return X86::COND_A;
2978 case ISD::SETULE: return X86::COND_BE;
2979 case ISD::SETUGE: return X86::COND_AE;
2980 // clang-format on
2981 }
2982}
2983
2984/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2985/// condition code, returning the condition code and the LHS/RHS of the
2986/// comparison to make.
2988 bool isFP, SDValue &LHS, SDValue &RHS,
2989 SelectionDAG &DAG) {
2990 if (!isFP) {
2991 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2992 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2993 // X > -1 -> X == 0, jump !sign.
2994 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2995 return X86::COND_NS;
2996 }
2997 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2998 // X < 0 -> X == 0, jump on sign.
2999 return X86::COND_S;
3000 }
3001 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3002 // X >= 0 -> X == 0, jump on !sign.
3003 return X86::COND_NS;
3004 }
3005 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3006 // X < 1 -> X <= 0
3007 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3008 return X86::COND_LE;
3009 }
3010 }
3011
3012 return TranslateIntegerX86CC(SetCCOpcode);
3013 }
3014
3015 // First determine if it is required or is profitable to flip the operands.
3016
3017 // If LHS is a foldable load, but RHS is not, flip the condition.
3018 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3019 !ISD::isNON_EXTLoad(RHS.getNode())) {
3020 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3021 std::swap(LHS, RHS);
3022 }
3023
3024 switch (SetCCOpcode) {
3025 default: break;
3026 case ISD::SETOLT:
3027 case ISD::SETOLE:
3028 case ISD::SETUGT:
3029 case ISD::SETUGE:
3030 std::swap(LHS, RHS);
3031 break;
3032 }
3033
3034 // On a floating point condition, the flags are set as follows:
3035 // ZF PF CF op
3036 // 0 | 0 | 0 | X > Y
3037 // 0 | 0 | 1 | X < Y
3038 // 1 | 0 | 0 | X == Y
3039 // 1 | 1 | 1 | unordered
3040 switch (SetCCOpcode) {
3041 // clang-format off
3042 default: llvm_unreachable("Condcode should be pre-legalized away");
3043 case ISD::SETUEQ:
3044 case ISD::SETEQ: return X86::COND_E;
3045 case ISD::SETOLT: // flipped
3046 case ISD::SETOGT:
3047 case ISD::SETGT: return X86::COND_A;
3048 case ISD::SETOLE: // flipped
3049 case ISD::SETOGE:
3050 case ISD::SETGE: return X86::COND_AE;
3051 case ISD::SETUGT: // flipped
3052 case ISD::SETULT:
3053 case ISD::SETLT: return X86::COND_B;
3054 case ISD::SETUGE: // flipped
3055 case ISD::SETULE:
3056 case ISD::SETLE: return X86::COND_BE;
3057 case ISD::SETONE:
3058 case ISD::SETNE: return X86::COND_NE;
3059 case ISD::SETUO: return X86::COND_P;
3060 case ISD::SETO: return X86::COND_NP;
3061 case ISD::SETOEQ:
3062 case ISD::SETUNE: return X86::COND_INVALID;
3063 // clang-format on
3064 }
3065}
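// Illustrative example of the translation above: for an ordered compare such
// as (setolt %x, %y), the operands are swapped first, so the emitted compare
// is CMP(%y, %x) and the returned condition is X86::COND_A ("above"), which
// holds exactly when %x < %y under the flag table in the comment. Codes with
// no single-flag encoding, such as SETOEQ and SETUNE, return
// X86::COND_INVALID and the caller handles them separately.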
3066
3067/// Is there a floating point cmov for the specific X86 condition code?
3068/// Current x86 isa includes the following FP cmov instructions:
3069/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3070static bool hasFPCMov(unsigned X86CC) {
3071 switch (X86CC) {
3072 default:
3073 return false;
3074 case X86::COND_B:
3075 case X86::COND_BE:
3076 case X86::COND_E:
3077 case X86::COND_P:
3078 case X86::COND_A:
3079 case X86::COND_AE:
3080 case X86::COND_NE:
3081 case X86::COND_NP:
3082 return true;
3083 }
3084}
3085
3086static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3087 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3088 VT.is512BitVector();
3089}
3090
3092 const CallInst &I,
3093 MachineFunction &MF,
3094 unsigned Intrinsic) const {
3096 Info.offset = 0;
3097
3098 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
3099 if (!IntrData) {
3100 switch (Intrinsic) {
3101 case Intrinsic::x86_aesenc128kl:
3102 case Intrinsic::x86_aesdec128kl:
3104 Info.ptrVal = I.getArgOperand(1);
3105 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3106 Info.align = Align(1);
3108 return true;
3109 case Intrinsic::x86_aesenc256kl:
3110 case Intrinsic::x86_aesdec256kl:
3112 Info.ptrVal = I.getArgOperand(1);
3113 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3114 Info.align = Align(1);
3116 return true;
3117 case Intrinsic::x86_aesencwide128kl:
3118 case Intrinsic::x86_aesdecwide128kl:
3120 Info.ptrVal = I.getArgOperand(0);
3121 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3122 Info.align = Align(1);
3124 return true;
3125 case Intrinsic::x86_aesencwide256kl:
3126 case Intrinsic::x86_aesdecwide256kl:
3128 Info.ptrVal = I.getArgOperand(0);
3129 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3130 Info.align = Align(1);
3132 return true;
3133 case Intrinsic::x86_cmpccxadd32:
3134 case Intrinsic::x86_cmpccxadd64:
3135 case Intrinsic::x86_atomic_bts:
3136 case Intrinsic::x86_atomic_btc:
3137 case Intrinsic::x86_atomic_btr: {
3139 Info.ptrVal = I.getArgOperand(0);
3140 unsigned Size = I.getType()->getScalarSizeInBits();
3141 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3142 Info.align = Align(Size);
3145 return true;
3146 }
3147 case Intrinsic::x86_atomic_bts_rm:
3148 case Intrinsic::x86_atomic_btc_rm:
3149 case Intrinsic::x86_atomic_btr_rm: {
3151 Info.ptrVal = I.getArgOperand(0);
3152 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3153 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3154 Info.align = Align(Size);
3157 return true;
3158 }
3159 case Intrinsic::x86_aadd32:
3160 case Intrinsic::x86_aadd64:
3161 case Intrinsic::x86_aand32:
3162 case Intrinsic::x86_aand64:
3163 case Intrinsic::x86_aor32:
3164 case Intrinsic::x86_aor64:
3165 case Intrinsic::x86_axor32:
3166 case Intrinsic::x86_axor64:
3167 case Intrinsic::x86_atomic_add_cc:
3168 case Intrinsic::x86_atomic_sub_cc:
3169 case Intrinsic::x86_atomic_or_cc:
3170 case Intrinsic::x86_atomic_and_cc:
3171 case Intrinsic::x86_atomic_xor_cc: {
3173 Info.ptrVal = I.getArgOperand(0);
3174 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3175 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3176 Info.align = Align(Size);
3179 return true;
3180 }
3181 }
3182 return false;
3183 }
3184
3185 switch (IntrData->Type) {
3188 case TRUNCATE_TO_MEM_VI32: {
3190 Info.ptrVal = I.getArgOperand(0);
3191 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3193 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3194 ScalarVT = MVT::i8;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3196 ScalarVT = MVT::i16;
3197 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3198 ScalarVT = MVT::i32;
3199
3200 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3201 Info.align = Align(1);
3203 break;
3204 }
3205 case GATHER:
3206 case GATHER_AVX2: {
3208 Info.ptrVal = nullptr;
3209 MVT DataVT = MVT::getVT(I.getType());
3210 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3211 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3212 IndexVT.getVectorNumElements());
3213 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3214 Info.align = Align(1);
3216 break;
3217 }
3218 case SCATTER: {
3220 Info.ptrVal = nullptr;
3221 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3222 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3223 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3224 IndexVT.getVectorNumElements());
3225 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3226 Info.align = Align(1);
3228 break;
3229 }
3230 default:
3231 return false;
3232 }
3233
3234 return true;
3235}
3236
3237/// Returns true if the target can instruction select the
3238/// specified FP immediate natively. If false, the legalizer will
3239/// materialize the FP immediate as a load from a constant pool.
3241 bool ForCodeSize) const {
3242 for (const APFloat &FPImm : LegalFPImmediates)
3243 if (Imm.bitwiseIsEqual(FPImm))
3244 return true;
3245 return false;
3246}
3247
3249 ISD::LoadExtType ExtTy,
3250 EVT NewVT) const {
3251 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3252
3253 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3254 // relocation targets a movq or addq instruction: don't let the load shrink.
3255 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3256 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3257 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3258 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3259
3260 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3261 // those uses are extracted directly into a store, then the extract + store
3262 // can be store-folded. Therefore, it's probably not worth splitting the load.
3263 EVT VT = Load->getValueType(0);
3264 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3265 for (SDUse &Use : Load->uses()) {
3266 // Skip uses of the chain value. Result 0 of the node is the load value.
3267 if (Use.getResNo() != 0)
3268 continue;
3269
3270 SDNode *User = Use.getUser();
3271
3272 // If this use is not an extract + store, it's probably worth splitting.
3273 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
3274 User->user_begin()->getOpcode() != ISD::STORE)
3275 return true;
3276 }
3277 // All non-chain uses are extract + store.
3278 return false;
3279 }
3280
3281 return true;
3282}
3283
3284/// Returns true if it is beneficial to convert a load of a constant
3285/// to just the constant itself.
3287 Type *Ty) const {
3288 assert(Ty->isIntegerTy());
3289
3290 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3291 if (BitSize == 0 || BitSize > 64)
3292 return false;
3293 return true;
3294}
3295
3297 // If we are using XMM registers in the ABI and the condition of the select is
3298 // a floating-point compare and we have blendv or conditional move, then it is
3299 // cheaper to select instead of doing a cross-register move and creating a
3300 // load that depends on the compare result.
3301 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3302 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3303}
3304
3306 // TODO: It might be a win to ease or lift this restriction, but the generic
3307 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3308 if (VT.isVector() && Subtarget.hasAVX512())
3309 return false;
3310
3311 return true;
3312}
3313
3315 SDValue C) const {
3316 // TODO: We handle scalars using custom code, but generic combining could make
3317 // that unnecessary.
3318 APInt MulC;
3319 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3320 return false;
3321
3322 // Find the type this will be legalized to. Otherwise we might prematurely
3323 // convert this to shl+add/sub and then still have to type legalize those ops.
3324 // Another choice would be to defer the decision for illegal types until
3325 // after type legalization. But constant splat vectors of i64 can't make it
3326 // through type legalization on 32-bit targets so we would need to special
3327 // case vXi64.
3328 while (getTypeAction(Context, VT) != TypeLegal)
3329 VT = getTypeToTransformTo(Context, VT);
3330
3331 // If vector multiply is legal, assume that's faster than shl + add/sub.
3332 // Multiply is a complex op with higher latency and lower throughput in
3333 // most implementations, sub-vXi32 vector multiplies are always fast,
3334 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3335 // is always going to be slow.
3336 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3337 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3338 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3339 return false;
3340
3341 // shl+add, shl+sub, shl+add+neg
3342 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3343 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3344}
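// Illustrative examples of the shl+add/sub decomposition this hook allows: for
// a splat constant C where (C - 1) is a power of two, e.g. C == 17,
//   mul x, 17  ->  (x << 4) + x
// and where (C + 1) is a power of two, e.g. C == 31,
//   mul x, 31  ->  (x << 5) - x
// The remaining two cases cover negated constants such as -31 and -33.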
3345
3347 unsigned Index) const {
3349 return false;
3350
3351 // Mask vectors support all subregister combinations and operations that
3352 // extract half of vector.
3353 if (ResVT.getVectorElementType() == MVT::i1)
3354 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3355 (Index == ResVT.getVectorNumElements()));
3356
3357 return (Index % ResVT.getVectorNumElements()) == 0;
3358}
3359
3361 unsigned Opc = VecOp.getOpcode();
3362
3363 // Assume target opcodes can't be scalarized.
3364 // TODO - do we have any exceptions?
3365 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3366 return false;
3367
3368 // If the vector op is not supported, try to convert to scalar.
3369 EVT VecVT = VecOp.getValueType();
3370 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3371 return true;
3372
3373 // If the vector op is supported, but the scalar op is not, the transform may
3374 // not be worthwhile.
3375 EVT ScalarVT = VecVT.getScalarType();
3376 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3377}
3378
3380 bool) const {
3381 // TODO: Allow vectors?
3382 if (VT.isVector())
3383 return false;
3384 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3385}
3386
3388 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3389 // i32/i64 or can rely on BSF passthrough value.
3390 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3391 Subtarget.hasBitScanPassThrough() ||
3392 (!Ty->isVectorTy() &&
3393 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3394}
3395
3397 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3398 // passthrough value.
3399 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3400 Subtarget.hasBitScanPassThrough();
3401}
3402
3404 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3405 // expensive than a straight movsd. On the other hand, it's important to
3406 // shrink long double fp constant since fldt is very slow.
3407 return !Subtarget.hasSSE2() || VT == MVT::f80;
3408}
3409
3411 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3412 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3413}
3414
3416 const SelectionDAG &DAG,
3417 const MachineMemOperand &MMO) const {
3418 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3419 BitcastVT.getVectorElementType() == MVT::i1)
3420 return false;
3421
3422 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3423 return false;
3424
3425 // If both types are legal vectors, it's always ok to convert them.
3426 if (LoadVT.isVector() && BitcastVT.isVector() &&
3427 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3428 return true;
3429
3430 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3431}
3432
3434 const MachineFunction &MF) const {
3435 // Do not merge to float value size (128 bits) if the NoImplicitFloat
3436 // attribute is set.
3437 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3438
3439 if (NoFloat) {
3440 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3441 return (MemVT.getSizeInBits() <= MaxIntSize);
3442 }
3443 // Make sure we don't merge greater than our preferred vector
3444 // width.
3445 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3446 return false;
3447
3448 return true;
3449}
3450
3452 return Subtarget.hasFastLZCNT();
3453}
3454
3456 const Instruction &AndI) const {
3457 return true;
3458}
3459
3461 EVT VT = Y.getValueType();
3462
3463 if (VT.isVector())
3464 return false;
3465
3466 if (!Subtarget.hasBMI())
3467 return false;
3468
3469 // There are only 32-bit and 64-bit forms for 'andn'.
3470 if (VT != MVT::i32 && VT != MVT::i64)
3471 return false;
3472
3473 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3474}
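// Illustrative sketch of why this hook returns true: with BMI, a compare such
// as (X & Y) == Y can be rewritten as (~X & Y) == 0, which maps onto a single
// ANDN that both computes ~X & Y and sets ZF, instead of a NOT + TEST
// sequence. ANDN only exists in 32-bit and 64-bit scalar forms, hence the
// type checks above.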
3475
3477 EVT VT = Y.getValueType();
3478
3479 if (!VT.isVector())
3480 return hasAndNotCompare(Y);
3481
3482 // Vector.
3483
3484 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3485 return false;
3486
3487 if (VT == MVT::v4i32)
3488 return true;
3489
3490 return Subtarget.hasSSE2();
3491}
3492
3494 return X.getValueType().isScalarInteger(); // 'bt'
3495}
3496
3500 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3501 SelectionDAG &DAG) const {
3502 // Does baseline recommend not to perform the fold by default?
3504 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3505 return false;
3506 // For scalars this transform is always beneficial.
3507 if (X.getValueType().isScalarInteger())
3508 return true;
3509 // If all the shift amounts are identical, then transform is beneficial even
3510 // with rudimentary SSE2 shifts.
3511 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3512 return true;
3513 // If we have AVX2 with its powerful shift operations, then it's also good.
3514 if (Subtarget.hasAVX2())
3515 return true;
3516 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3517 return NewShiftOpcode == ISD::SHL;
3518}
3519
3521 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3522 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3523 if (!VT.isInteger())
3524 return ShiftOpc;
3525
3526 bool PreferRotate = false;
3527 if (VT.isVector()) {
3528 // For vectors, if we have rotate instruction support, then it's definitely
3529 // best. Otherwise it's not clear what's best, so just don't make changes.
3530 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3531 VT.getScalarType() == MVT::i64);
3532 } else {
3533 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3534 // rotate unless we have a zext mask+shr.
3535 PreferRotate = Subtarget.hasBMI2();
3536 if (!PreferRotate) {
3537 unsigned MaskBits =
3538 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3539 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3540 }
3541 }
3542
3543 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3544 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3545
3546 if (PreferRotate && MayTransformRotate)
3547 return ISD::ROTL;
3548
3549 // For vectors we don't really get much benefit from swapping around constants.
3550 // Maybe we could check if the DAG has the flipped node already in the
3551 // future.
3552 if (VT.isVector())
3553 return ShiftOpc;
3554
3555 // See if it's beneficial to swap the shift type.
3556 if (ShiftOpc == ISD::SHL) {
3557 // If the current setup has an imm64 mask, then the inverse will have
3558 // at least an imm32 mask (or be zext i32 -> i64).
3559 if (VT == MVT::i64)
3560 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3561 : ShiftOpc;
3562
3563 // We can only benefit if the mask requires at least 7 bits. We
3564 // don't want to replace shl of 1,2,3 as they can be implemented
3565 // with lea/add.
3566 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3567 }
3568
3569 if (VT == MVT::i64)
3570 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3571 // extremely efficient.
3572 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3573
3574 // Keep small shifts as shl so we can generate add/lea.
3575 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3576 }
3577
3578 // We prefer rotate for vectors or if we won't get a zext mask with SRL
3579 // (PreferRotate will be set in the latter case).
3580 if (PreferRotate || !MayTransformRotate || VT.isVector())
3581 return ShiftOpc;
3582
3583 // Non-vector type and we have a zext mask with SRL.
3584 return ISD::SRL;
3585}
3586
3589 const Value *Lhs,
3590 const Value *Rhs) const {
3591 using namespace llvm::PatternMatch;
3592 int BaseCost = BrMergingBaseCostThresh.getValue();
3593 // With CCMP, branches can be merged in a more efficient way.
3594 if (BaseCost >= 0 && Subtarget.hasCCMP())
3595 BaseCost += BrMergingCcmpBias;
3596 // a == b && a == c is a fast pattern on x86.
3597 if (BaseCost >= 0 && Opc == Instruction::And &&
3600 BaseCost += 1;
3601 return {BaseCost, BrMergingLikelyBias.getValue(),
3602 BrMergingUnlikelyBias.getValue()};
3603}
3604
3606 return N->getOpcode() != ISD::FP_EXTEND;
3607}
3608
3610 const SDNode *N, CombineLevel Level) const {
3611 assert(((N->getOpcode() == ISD::SHL &&
3612 N->getOperand(0).getOpcode() == ISD::SRL) ||
3613 (N->getOpcode() == ISD::SRL &&
3614 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3615 "Expected shift-shift mask");
3616 // TODO: Should we always create i64 masks? Or only folded immediates?
3617 EVT VT = N->getValueType(0);
3618 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3619 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3620 // Only fold if the shift values are equal - so it folds to AND.
3621 // TODO - we should fold if either is a non-uniform vector but we don't do
3622 // the fold for non-splats yet.
3623 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3624 }
3626}
3627
3629 EVT VT = Y.getValueType();
3630
3631 // For vectors, we don't have a preference, but we probably want a mask.
3632 if (VT.isVector())
3633 return false;
3634
3635 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3636 if (VT == MVT::i64 && !Subtarget.is64Bit())
3637 return false;
3638
3639 return true;
3640}
3641
3644 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3646 !Subtarget.isOSWindows())
3649 ExpansionFactor);
3650}
3651
3653 // Any legal vector type can be splatted more efficiently than
3654 // loading/spilling from memory.
3655 return isTypeLegal(VT);
3656}
3657
3659 MVT VT = MVT::getIntegerVT(NumBits);
3660 if (isTypeLegal(VT))
3661 return VT;
3662
3663 // PMOVMSKB can handle this.
3664 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3665 return MVT::v16i8;
3666
3667 // VPMOVMSKB can handle this.
3668 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3669 return MVT::v32i8;
3670
3671 // TODO: Allow 64-bit type for 32-bit target.
3672 // TODO: 512-bit types should be allowed, but make sure that those
3673 // cases are handled in combineVectorSizedSetCCEquality().
3674
3676}
3677
3678/// Val is the undef sentinel value or equal to the specified value.
3679static bool isUndefOrEqual(int Val, int CmpVal) {
3680 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3681}
3682
3683/// Return true if every element in Mask is the undef sentinel value or equal to
3684/// the specified value.
3685static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3686 return llvm::all_of(Mask, [CmpVal](int M) {
3687 return (M == SM_SentinelUndef) || (M == CmpVal);
3688 });
3689}
3690
3691/// Return true if every element in Mask, beginning from position Pos and ending
3692/// in Pos+Size is the undef sentinel value or equal to the specified value.
3693static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3694 unsigned Size) {
3695 return llvm::all_of(Mask.slice(Pos, Size),
3696 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3697}
3698
3699/// Val is either the undef or zero sentinel value.
3700static bool isUndefOrZero(int Val) {
3701 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3702}
3703
3704/// Return true if every element in Mask, beginning from position Pos and ending
3705/// in Pos+Size is the undef sentinel value.
3706static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3707 return llvm::all_of(Mask.slice(Pos, Size),
3708 [](int M) { return M == SM_SentinelUndef; });
3709}
3710
3711/// Return true if the mask creates a vector whose lower half is undefined.
3713 unsigned NumElts = Mask.size();
3714 return isUndefInRange(Mask, 0, NumElts / 2);
3715}
3716
3717/// Return true if the mask creates a vector whose upper half is undefined.
3719 unsigned NumElts = Mask.size();
3720 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3721}
3722
3723/// Return true if Val falls within the specified range [Low, Hi).
3724static bool isInRange(int Val, int Low, int Hi) {
3725 return (Val >= Low && Val < Hi);
3726}
3727
3728/// Return true if the value of any element in Mask falls within the specified
3729/// range [Low, Hi).
3730static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3731 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3732}
3733
3734/// Return true if the value of any element in Mask is the zero sentinel value.
3735static bool isAnyZero(ArrayRef<int> Mask) {
3736 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3737}
3738
3739/// Return true if Val is undef or if its value falls within the
3740/// specified range [Low, Hi).
3741static bool isUndefOrInRange(int Val, int Low, int Hi) {
3742 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3743}
3744
3745/// Return true if every element in Mask is undef or if its value
3746/// falls within the specified range [Low, Hi).
3747static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3748 return llvm::all_of(
3749 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3750}
3751
3752/// Return true if Val is undef, zero or if its value falls within the
3753/// specified range [Low, Hi).
3754static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3755 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3756}
3757
3758/// Return true if every element in Mask is undef, zero or if its value
3759/// falls within the specified range [Low, Hi).
3760static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3761 return llvm::all_of(
3762 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3763}
3764
3765/// Return true if every element in Mask is an in-place blend/select mask or is
3766/// undef.
3768 unsigned NumElts = Mask.size();
3769 for (auto [I, M] : enumerate(Mask))
3770 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3771 return false;
3772 return true;
3773}
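// Illustrative example: with 4-element inputs the mask <0, 5, 2, 7> is a
// blend/select mask (each lane I takes either element I of the first input or
// element I + NumElts of the second), while <1, 5, 2, 7> is not, because lane
// 0 reads a different lane of the first input.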
3774
3775/// Return true if every element in Mask, beginning
3776/// from position Pos and ending in Pos + Size, falls within the specified
3777/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3778static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3779 unsigned Size, int Low, int Step = 1) {
3780 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3781 if (!isUndefOrEqual(Mask[i], Low))
3782 return false;
3783 return true;
3784}
3785
3786/// Return true if every element in Mask, beginning
3787/// from position Pos and ending in Pos+Size, falls within the specified
3788/// sequential range [Low, Low+Size), or is undef or is zero.
3790 unsigned Size, int Low,
3791 int Step = 1) {
3792 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3793 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3794 return false;
3795 return true;
3796}
3797
3798/// Return true if every element in Mask, beginning
3799/// from position Pos and ending in Pos+Size is undef or is zero.
3800static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3801 unsigned Size) {
3802 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3803}
3804
3805/// Return true if every element of a single input is referenced by the shuffle
3806/// mask, i.e. it just permutes them all.
3808 unsigned NumElts = Mask.size();
3809 APInt DemandedElts = APInt::getZero(NumElts);
3810 for (int M : Mask)
3811 if (isInRange(M, 0, NumElts))
3812 DemandedElts.setBit(M);
3813 return DemandedElts.isAllOnes();
3814}
3815
3816/// Helper function to test whether a shuffle mask could be
3817/// simplified by widening the elements being shuffled.
3818///
3819/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3820/// leaves it in an unspecified state.
3821///
3822/// NOTE: This must handle normal vector shuffle masks and *target* vector
3823/// shuffle masks. The latter have the special property of a '-2' representing
3824/// a zero-ed lane of a vector.
3826 SmallVectorImpl<int> &WidenedMask) {
3827 WidenedMask.assign(Mask.size() / 2, 0);
3828 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3829 int M0 = Mask[i];
3830 int M1 = Mask[i + 1];
3831
3832 // If both elements are undef, it's trivial.
3833 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3834 WidenedMask[i / 2] = SM_SentinelUndef;
3835 continue;
3836 }
3837
3838 // Check for an undef mask and a mask value properly aligned to fit with
3839 // a pair of values. If we find such a case, use the non-undef mask's value.
3840 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3841 WidenedMask[i / 2] = M1 / 2;
3842 continue;
3843 }
3844 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3845 WidenedMask[i / 2] = M0 / 2;
3846 continue;
3847 }
3848
3849 // When zeroing, we need to spread the zeroing across both lanes to widen.
3850 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3851 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3853 WidenedMask[i / 2] = SM_SentinelZero;
3854 continue;
3855 }
3856 return false;
3857 }
3858
3859 // Finally check if the two mask values are adjacent and aligned with
3860 // a pair.
3861 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3862 WidenedMask[i / 2] = M0 / 2;
3863 continue;
3864 }
3865
3866 // Otherwise we can't safely widen the elements used in this shuffle.
3867 return false;
3868 }
3869 assert(WidenedMask.size() == Mask.size() / 2 &&
3870 "Incorrect size of mask after widening the elements!");
3871
3872 return true;
3873}
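// Illustrative examples of the widening above (v4 masks widened to v2):
//   <0, 1, 6, 7>   -> <0, 3>   (both pairs are adjacent and aligned)
//   <-1, 5, 2, 3>  -> <2, 1>   (the undef lane adopts its partner's pair)
//   <-2, -1, 0, 1> -> <-2, 0>  (zeroing spreads across the whole pair)
//   <1, 2, 4, 5>   -> fails    (the first pair straddles an element boundary)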
3874
3876 const APInt &Zeroable,
3877 bool V2IsZero,
3878 SmallVectorImpl<int> &WidenedMask) {
3879 // Create an alternative mask with info about zeroable elements.
3880 // Here we do not set undef elements as zeroable.
3881 SmallVector<int, 64> ZeroableMask(Mask);
3882 if (V2IsZero) {
3883 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3884 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3885 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3886 ZeroableMask[i] = SM_SentinelZero;
3887 }
3888 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3889}
3890
3892 SmallVector<int, 32> WidenedMask;
3893 return canWidenShuffleElements(Mask, WidenedMask);
3894}
3895
3896// Attempt to narrow/widen shuffle mask until it matches the target number of
3897// elements.
3898static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3899 SmallVectorImpl<int> &ScaledMask) {
3900 unsigned NumSrcElts = Mask.size();
3901 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3902 "Illegal shuffle scale factor");
3903
3904 // Narrowing is guaranteed to work.
3905 if (NumDstElts >= NumSrcElts) {
3906 int Scale = NumDstElts / NumSrcElts;
3907 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3908 return true;
3909 }
3910
3911 // We have to repeat the widening until we reach the target size, but we can
3912 // split out the first widening as it sets up ScaledMask for us.
3913 if (canWidenShuffleElements(Mask, ScaledMask)) {
3914 while (ScaledMask.size() > NumDstElts) {
3915 SmallVector<int, 16> WidenedMask;
3916 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3917 return false;
3918 ScaledMask = std::move(WidenedMask);
3919 }
3920 return true;
3921 }
3922
3923 return false;
3924}
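// Illustrative examples of rescaling a mask to a different element count:
//   <0, 2> scaled to 4 elements narrows to <0, 1, 4, 5>
//   <0, 1, 6, 7> scaled to 2 elements widens to <0, 3>
//   <1, 0, 2, 3> scaled to 2 elements fails, since the swapped low pair cannot
//   be represented with wider elements.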
3925
3926static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3927 SmallVector<int, 32> ScaledMask;
3928 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3929}
3930
3931/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3933 return isNullConstant(Elt) || isNullFPConstant(Elt);
3934}
3935
3936// Build a vector of constants.
3937// Use an UNDEF node if MaskElt == -1.
3938// Split 64-bit constants in 32-bit mode.
3940 const SDLoc &dl, bool IsMask = false) {
3941
3943 bool Split = false;
3944
3945 MVT ConstVecVT = VT;
3946 unsigned NumElts = VT.getVectorNumElements();
3947 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3948 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3949 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3950 Split = true;
3951 }
3952
3953 MVT EltVT = ConstVecVT.getVectorElementType();
3954 for (unsigned i = 0; i < NumElts; ++i) {
3955 bool IsUndef = Values[i] < 0 && IsMask;
3956 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3957 DAG.getConstant(Values[i], dl, EltVT);
3958 Ops.push_back(OpNode);
3959 if (Split)
3960 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3961 DAG.getConstant(0, dl, EltVT));
3962 }
3963 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3964 if (Split)
3965 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3966 return ConstsNode;
3967}
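// Illustrative example of the 64-bit split above: building the v2i64 mask
// <3, -1> in 32-bit mode (where i64 is not legal) emits the v4i32
// build_vector <3, 0, undef, undef>, splitting each 64-bit lane into a low
// half holding the value and a zero (or undef) high half, then bitcasts the
// result back to v2i64.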
3968
3969static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3970 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3971 assert(Bits.size() == Undefs.getBitWidth() &&
3972 "Unequal constant and undef arrays");
3974 bool Split = false;
3975
3976 MVT ConstVecVT = VT;
3977 unsigned NumElts = VT.getVectorNumElements();
3978 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3979 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3980 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3981 Split = true;
3982 }
3983
3984 MVT EltVT = ConstVecVT.getVectorElementType();
3985 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3986 if (Undefs[i]) {
3987 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3988 continue;
3989 }
3990 const APInt &V = Bits[i];
3991 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3992 if (Split) {
3993 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3994 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3995 } else if (EltVT == MVT::f32) {
3997 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3998 } else if (EltVT == MVT::f64) {
4000 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4001 } else {
4002 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4003 }
4004 }
4005
4006 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4007 return DAG.getBitcast(VT, ConstsNode);
4008}
4009
4011 SelectionDAG &DAG, const SDLoc &dl) {
4012 APInt Undefs = APInt::getZero(Bits.size());
4013 return getConstVector(Bits, Undefs, VT, DAG, dl);
4014}
4015
4016/// Returns a vector of specified type with all zero elements.
4017static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4018 SelectionDAG &DAG, const SDLoc &dl) {
4019 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4020 VT.getVectorElementType() == MVT::i1) &&
4021 "Unexpected vector type");
4022
4023 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4024 // type. This ensures they get CSE'd. But if the integer type is not
4025 // available, use a floating-point +0.0 instead.
4026 SDValue Vec;
4027 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4028 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4029 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4030 } else if (VT.isFloatingPoint() &&
4032 Vec = DAG.getConstantFP(+0.0, dl, VT);
4033 } else if (VT.getVectorElementType() == MVT::i1) {
4034 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4035 "Unexpected vector type");
4036 Vec = DAG.getConstant(0, dl, VT);
4037 } else {
4038 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4039 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4040 }
4041 return DAG.getBitcast(VT, Vec);
4042}
4043
4044// Helper to determine if the ops are all extracted subvectors that come from a
4045// single source. If we allow commute they don't have to be in order (Lo/Hi).
4046static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4047 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4048 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4049 LHS.getValueType() != RHS.getValueType() ||
4050 LHS.getOperand(0) != RHS.getOperand(0))
4051 return SDValue();
4052
4053 SDValue Src = LHS.getOperand(0);
4054 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4055 return SDValue();
4056
4057 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4058 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4059 RHS.getConstantOperandAPInt(1) == NumElts) ||
4060 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4061 LHS.getConstantOperandAPInt(1) == NumElts))
4062 return Src;
4063
4064 return SDValue();
4065}
4066
4067static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4068 const SDLoc &dl, unsigned vectorWidth) {
4069 EVT VT = Vec.getValueType();
4070 EVT ElVT = VT.getVectorElementType();
4071 unsigned Factor = VT.getSizeInBits() / vectorWidth;
4072 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4073 VT.getVectorNumElements() / Factor);
4074
4075 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4076 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4077 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4078
4079 // This is the index of the first element of the vectorWidth-bit chunk
4080 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4081 IdxVal &= ~(ElemsPerChunk - 1);
4082
4083 // If the input is a buildvector just emit a smaller one.
4084 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4085 return DAG.getBuildVector(ResultVT, dl,
4086 Vec->ops().slice(IdxVal, ElemsPerChunk));
4087
4088 // Check if we're extracting the upper undef of a widening pattern.
4089 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4090 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4091 isNullConstant(Vec.getOperand(2)))
4092 return DAG.getUNDEF(ResultVT);
4093
4094 SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl);
4095 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4096}
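// Illustrative example: extracting a 128-bit chunk from a v8i32 with
// IdxVal == 5 first rounds the index down to the chunk boundary
// (IdxVal &= ~3, giving 4) and then emits (extract_subvector %vec, 4) with
// result type v4i32, i.e. the upper half of the source vector.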
4097
4098/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4099/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4100/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4101/// instructions or a simple subregister reference. Idx is an index in the
4102/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4103/// lowering EXTRACT_VECTOR_ELT operations easier.
4104static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4105 SelectionDAG &DAG, const SDLoc &dl) {
4107 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4108 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4109}
4110
4111/// Generate a DAG to grab 256-bits from a 512-bit vector.
4112static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4113 SelectionDAG &DAG, const SDLoc &dl) {
4114 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4115 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4116}
4117
4118static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4119 SelectionDAG &DAG, const SDLoc &dl,
4120 unsigned vectorWidth) {
4121 assert((vectorWidth == 128 || vectorWidth == 256) &&
4122 "Unsupported vector width");
4123 // Inserting UNDEF just returns Result unchanged.
4124 if (Vec.isUndef())
4125 return Result;
4126 EVT VT = Vec.getValueType();
4127 EVT ElVT = VT.getVectorElementType();
4128 EVT ResultVT = Result.getValueType();
4129
4130 // Insert the relevant vectorWidth bits.
4131 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4132 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4133
4134 // This is the index of the first element of the vectorWidth-bit chunk
4135 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4136 IdxVal &= ~(ElemsPerChunk - 1);
4137
4138 SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl);
4139 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4140}
4141
4142/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4143/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4144/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4145/// simple superregister reference. Idx is an index in the 128 bits
4146/// we want. It need not be aligned to a 128-bit boundary. That makes
4147/// lowering INSERT_VECTOR_ELT operations easier.
4148static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4149 SelectionDAG &DAG, const SDLoc &dl) {
4150 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4151 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Widen a vector to a larger size with the same scalar type, with the new
4155/// elements either zero or undef.
4156static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4157 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4158 const SDLoc &dl) {
4159 EVT VecVT = Vec.getValueType();
4161 VecVT.getScalarType() == VT.getScalarType() &&
4162 "Unsupported vector widening type");
4163 // If the upper 128-bits of a build vector are already undef/zero, then try to
4164 // widen from the lower 128-bits.
4165 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4166 unsigned NumSrcElts = VecVT.getVectorNumElements();
4167 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4168 if (all_of(Hi, [&](SDValue V) {
4169 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4170 }))
4171 Vec = extract128BitVector(Vec, 0, DAG, dl);
4172 }
4173 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4174 : DAG.getUNDEF(VT);
4175 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
4176 DAG.getVectorIdxConstant(0, dl));
4177}
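// Illustrative example: widening a v4i32 value to v8i32 with
// ZeroNewElements == true produces
//   (insert_subvector (zero v8i32), %vec, 0)
// so the original elements land in lanes 0-3 and lanes 4-7 are zero; with
// ZeroNewElements == false the base vector is undef instead.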
4178
4179/// Widen a vector to a larger size with the same scalar type, with the new
4180/// elements either zero or undef.
4181static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4182 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4183 const SDLoc &dl, unsigned WideSizeInBits) {
4184 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4185 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4186 "Unsupported vector widening type");
4187 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4188 MVT SVT = Vec.getSimpleValueType().getScalarType();
4189 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4190 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4191}
4192
4193/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4194/// and bitcast with integer types.
4195static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4196 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4197 unsigned NumElts = VT.getVectorNumElements();
4198 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4199 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4200 return VT;
4201}
4202
4203/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4204/// bitcast with integer types.
4205static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4206 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4207 const SDLoc &dl) {
4208 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4209 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4210}
4211
4212// Helper function to collect subvector ops that are concatenated together,
4213// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4214// The subvectors in Ops are guaranteed to be the same type.
4216 SelectionDAG &DAG) {
4217 assert(Ops.empty() && "Expected an empty ops vector");
4218
4219 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4220 Ops.append(N->op_begin(), N->op_end());
4221 return true;
4222 }
4223
4224 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4225 SDValue Src = N->getOperand(0);
4226 SDValue Sub = N->getOperand(1);
4227 const APInt &Idx = N->getConstantOperandAPInt(2);
4228 EVT VT = Src.getValueType();
4229 EVT SubVT = Sub.getValueType();
4230
4231 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4232 // insert_subvector(undef, x, lo)
4233 if (Idx == 0 && Src.isUndef()) {
4234 Ops.push_back(Sub);
4235 Ops.push_back(DAG.getUNDEF(SubVT));
4236 return true;
4237 }
4238 if (Idx == (VT.getVectorNumElements() / 2)) {
4239 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4240 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4241 Src.getOperand(1).getValueType() == SubVT &&
4242 isNullConstant(Src.getOperand(2))) {
4243 // Attempt to recurse into inner (matching) concats.
4244 SDValue Lo = Src.getOperand(1);
4245 SDValue Hi = Sub;
4246 SmallVector<SDValue, 2> LoOps, HiOps;
4247 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4248 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4249 LoOps.size() == HiOps.size()) {
4250 Ops.append(LoOps);
4251 Ops.append(HiOps);
4252 return true;
4253 }
4254 Ops.push_back(Lo);
4255 Ops.push_back(Hi);
4256 return true;
4257 }
4258 // insert_subvector(x, extract_subvector(x, lo), hi)
4259 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4260 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4261 Ops.append(2, Sub);
4262 return true;
4263 }
4264 // insert_subvector(undef, x, hi)
4265 if (Src.isUndef()) {
4266 Ops.push_back(DAG.getUNDEF(SubVT));
4267 Ops.push_back(Sub);
4268 return true;
4269 }
4270 }
4271 }
4272 }
4273
4274 return false;
4275}
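// Illustrative example: for the common widening pattern
//   insert_subvector (insert_subvector undef, %x, 0), %y, NumElts/2
// collectConcatOps fills Ops with {%x, %y}; for insert_subvector undef, %x, 0
// it fills Ops with {%x, undef}. Either way callers can treat the node as if
// it were a concat_vectors.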
4276
4277// Helper to check if \p V can be split into subvectors and the upper subvectors
4278// are all undef, in which case return the lower subvector.
4280 SelectionDAG &DAG) {
4281 SmallVector<SDValue> SubOps;
4282 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4283 return SDValue();
4284
4285 unsigned NumSubOps = SubOps.size();
4286 unsigned HalfNumSubOps = NumSubOps / 2;
4287 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4288
4289 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4290 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4291 return SDValue();
4292
4293 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4294 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4295 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4296}
4297
4298// Helper to check if we can access all the constituent subvectors without any
4299// extract ops.
4300 static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG) {
4301   SmallVector<SDValue, 4> Ops;
4302   return collectConcatOps(N, Ops, DAG);
4303}
4304
4305static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4306 const SDLoc &dl) {
4307 EVT VT = Op.getValueType();
4308 unsigned NumElems = VT.getVectorNumElements();
4309 unsigned SizeInBits = VT.getSizeInBits();
4310 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4311 "Can't split odd sized vector");
4312
4313 // If this is a splat value (with no-undefs) then use the lower subvector,
4314 // which should be a free extraction.
4315 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4316 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4317 return std::make_pair(Lo, Lo);
4318
4319 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4320 return std::make_pair(Lo, Hi);
4321}
4322
4323/// Break an operation into 2 half sized ops and then concatenate the results.
4324 static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4325   unsigned NumOps = Op.getNumOperands();
4326 EVT VT = Op.getValueType();
4327
4328 // Extract the LHS Lo/Hi vectors
4329 SmallVector<SDValue> LoOps(NumOps, SDValue());
4330 SmallVector<SDValue> HiOps(NumOps, SDValue());
4331 for (unsigned I = 0; I != NumOps; ++I) {
4332 SDValue SrcOp = Op.getOperand(I);
4333 if (!SrcOp.getValueType().isVector()) {
4334 LoOps[I] = HiOps[I] = SrcOp;
4335 continue;
4336 }
4337 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4338 }
4339
4340 EVT LoVT, HiVT;
4341 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4342 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4343 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4344 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4345}
4346
4347 /// Break a unary integer operation into 2 half sized ops and then
4348/// concatenate the result back.
4349 static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4350                                    const SDLoc &dl) {
4351 // Make sure we only try to split 256/512-bit types to avoid creating
4352 // narrow vectors.
4353 [[maybe_unused]] EVT VT = Op.getValueType();
4354 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4355 Op.getOperand(0).getValueType().is512BitVector()) &&
4356 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4357 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4358 VT.getVectorNumElements() &&
4359 "Unexpected VTs!");
4360 return splitVectorOp(Op, DAG, dl);
4361}
4362
4363/// Break a binary integer operation into 2 half sized ops and then
4364/// concatenate the result back.
4365 static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4366                                     const SDLoc &dl) {
4367 // Assert that all the types match.
4368 [[maybe_unused]] EVT VT = Op.getValueType();
4369 assert(Op.getOperand(0).getValueType() == VT &&
4370 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4371 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4372 return splitVectorOp(Op, DAG, dl);
4373}
4374
4375// Helper for splitting operands of an operation to legal target size and
4376// apply a function on each part.
4377// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4378// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4379// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4380// The argument Builder is a function that will be applied on each split part:
4381// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4382template <typename F>
4383 static SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4384                                 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4385 F Builder, bool CheckBWI = true) {
4386 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4387 unsigned NumSubs = 1;
4388 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4389 (!CheckBWI && Subtarget.useAVX512Regs())) {
4390 if (VT.getSizeInBits() > 512) {
4391 NumSubs = VT.getSizeInBits() / 512;
4392 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4393 }
4394 } else if (Subtarget.hasAVX2()) {
4395 if (VT.getSizeInBits() > 256) {
4396 NumSubs = VT.getSizeInBits() / 256;
4397 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4398 }
4399 } else {
4400 if (VT.getSizeInBits() > 128) {
4401 NumSubs = VT.getSizeInBits() / 128;
4402 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4403 }
4404 }
4405
4406 if (NumSubs == 1)
4407 return Builder(DAG, DL, Ops);
4408
4409   SmallVector<SDValue, 4> Subs;
4410   for (unsigned i = 0; i != NumSubs; ++i) {
4411     SmallVector<SDValue, 2> SubOps;
4412     for (SDValue Op : Ops) {
4413 EVT OpVT = Op.getValueType();
4414 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4415 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4416 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4417 }
4418 Subs.push_back(Builder(DAG, DL, SubOps));
4419 }
4420 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4421}
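// Illustrative sketch only (not part of the original file): a caller producing
// a wide X86ISD::VPMADDWD could hand SplitOpsAndApply a builder lambda and let
// it split v32i16 operands A and B into legal halves on pre-AVX512BW targets:
//   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                          ArrayRef<SDValue> Ops) {
//     MVT OpVT = Ops[0].getSimpleValueType();
//     MVT ResVT = MVT::getVectorVT(MVT::i32, OpVT.getVectorNumElements() / 2);
//     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops);
//   };
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, MVT::v16i32, {A, B},
//                                PMADDBuilder);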
4422
4423// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4424// targets.
4425static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4426                              ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4427                              const X86Subtarget &Subtarget) {
4428 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4429 MVT SVT = VT.getScalarType();
4430
4431 // If we have a 32/64 splatted constant, splat it to DstTy to
4432 // encourage a foldable broadcast'd operand.
4433 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4434 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4435 // AVX512 broadcasts 32/64-bit operands.
4436 // TODO: Support float once getAVX512Node is used by fp-ops.
4437     if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4438         !DAG.getTargetLoweringInfo().isTypeLegal(OpVT))
4439       return SDValue();
4440 // If we're not widening, don't bother if we're not bitcasting.
4441 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4442 return SDValue();
4443 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4444 APInt SplatValue, SplatUndef;
4445 unsigned SplatBitSize;
4446 bool HasAnyUndefs;
4447 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4448 HasAnyUndefs, OpEltSizeInBits) &&
4449 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4450 return DAG.getConstant(SplatValue, DL, DstVT);
4451 }
4452 return SDValue();
4453 };
4454
4455 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4456
4457 MVT DstVT = VT;
4458 if (Widen)
4459 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4460
4461 // Canonicalize src operands.
4462 SmallVector<SDValue> SrcOps(Ops);
4463 for (SDValue &Op : SrcOps) {
4464 MVT OpVT = Op.getSimpleValueType();
4465 // Just pass through scalar operands.
4466 if (!OpVT.isVector())
4467 continue;
4468 assert(OpVT == VT && "Vector type mismatch");
4469
4470 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4471 Op = BroadcastOp;
4472 continue;
4473 }
4474
4475 // Just widen the subvector by inserting into an undef wide vector.
4476 if (Widen)
4477 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4478 }
4479
4480 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4481
4482 // Perform the 512-bit op then extract the bottom subvector.
4483 if (Widen)
4484 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4485 return Res;
4486}
4487
4488/// Insert i1-subvector to i1-vector.
4489 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4490                                 const X86Subtarget &Subtarget) {
4491
4492 SDLoc dl(Op);
4493 SDValue Vec = Op.getOperand(0);
4494 SDValue SubVec = Op.getOperand(1);
4495 SDValue Idx = Op.getOperand(2);
4496 unsigned IdxVal = Op.getConstantOperandVal(2);
4497
4498 // Inserting undef is a nop. We can just return the original vector.
4499 if (SubVec.isUndef())
4500 return Vec;
4501
4502 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4503 return Op;
4504
4505 MVT OpVT = Op.getSimpleValueType();
4506 unsigned NumElems = OpVT.getVectorNumElements();
4507 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4508
4509 // Extend to natively supported kshift.
4510 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4511
4512 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4513 // if necessary.
4514 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4515 // May need to promote to a legal type.
4516 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4517 DAG.getConstant(0, dl, WideOpVT),
4518 SubVec, Idx);
4519 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4520 }
4521
4522 MVT SubVecVT = SubVec.getSimpleValueType();
4523 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4524 assert(IdxVal + SubVecNumElems <= NumElems &&
4525 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4526 "Unexpected index value in INSERT_SUBVECTOR");
4527
4528 SDValue Undef = DAG.getUNDEF(WideOpVT);
4529
4530 if (IdxVal == 0) {
4531 // Zero lower bits of the Vec
4532 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4533 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4534 ZeroIdx);
4535 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4536 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4537 // Merge them together, SubVec should be zero extended.
4538 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4539 DAG.getConstant(0, dl, WideOpVT),
4540 SubVec, ZeroIdx);
4541 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4542 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4543 }
4544
4545 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4546 Undef, SubVec, ZeroIdx);
4547
4548 if (Vec.isUndef()) {
4549 assert(IdxVal != 0 && "Unexpected index");
4550 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4551 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4552 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4553 }
4554
4555   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4556     assert(IdxVal != 0 && "Unexpected index");
4557 // If upper elements of Vec are known undef, then just shift into place.
4558 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4559 [](SDValue V) { return V.isUndef(); })) {
4560 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4561 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4562 } else {
4563 NumElems = WideOpVT.getVectorNumElements();
4564 unsigned ShiftLeft = NumElems - SubVecNumElems;
4565 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4566 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4567 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4568 if (ShiftRight != 0)
4569 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4570 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4571 }
4572 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4573 }
4574
4575 // Simple case when we put subvector in the upper part
4576 if (IdxVal + SubVecNumElems == NumElems) {
4577 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4578 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4579 if (SubVecNumElems * 2 == NumElems) {
4580 // Special case, use legal zero extending insert_subvector. This allows
4581 // isel to optimize when bits are known zero.
4582 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4583 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4584 DAG.getConstant(0, dl, WideOpVT),
4585 Vec, ZeroIdx);
4586 } else {
4587 // Otherwise use explicit shifts to zero the bits.
4588 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4589 Undef, Vec, ZeroIdx);
4590 NumElems = WideOpVT.getVectorNumElements();
4591 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4592 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4593 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4594 }
4595 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4596 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4597 }
4598
4599 // Inserting into the middle is more complicated.
4600
4601 NumElems = WideOpVT.getVectorNumElements();
4602
4603 // Widen the vector if needed.
4604 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4605
4606 unsigned ShiftLeft = NumElems - SubVecNumElems;
4607 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4608
4609 // Do an optimization for the most frequently used types.
4610 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4611 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4612 Mask0.flipAllBits();
4613 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4614 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4615 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4616 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4617 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4618 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4619 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4620 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4621
4622 // Reduce to original width if needed.
4623 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4624 }
4625
4626 // Clear the upper bits of the subvector and move it to its insert position.
4627 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4628 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4629 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4630 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4631
4632 // Isolate the bits below the insertion point.
4633 unsigned LowShift = NumElems - IdxVal;
4634 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4635 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4636 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4637 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4638
4639 // Isolate the bits after the last inserted bit.
4640 unsigned HighShift = IdxVal + SubVecNumElems;
4641 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4642 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4643 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4644 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4645
4646 // Now OR all 3 pieces together.
4647 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4648 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4649
4650 // Reduce to original width if needed.
4651 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4652}
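// Worked example of the "inserting into the middle" case: inserting a v2i1
// subvector at index 2 of a v8i1 vector (with DQI, so WideOpVT stays v8i1)
// gives ShiftLeft = 8 - 2 = 6 and ShiftRight = 8 - 2 - 2 = 4; the subvector is
// KSHIFTL'd to bits [6,7], KSHIFTR'd back down to bits [2,3], and OR'd into
// Vec after Vec has been ANDed with the bitcast i8 mask 0xF3 to clear bits 2
// and 3.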
4653
4654 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4655                                 const SDLoc &dl) {
4656 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4657 EVT SubVT = V1.getValueType();
4658 EVT SubSVT = SubVT.getScalarType();
4659 unsigned SubNumElts = SubVT.getVectorNumElements();
4660 unsigned SubVectorWidth = SubVT.getSizeInBits();
4661 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4662 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4663 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4664}
4665
4666/// Returns a vector of specified type with all bits set.
4667/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4668/// Then bitcast to their original type, ensuring they get CSE'd.
4669static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4670 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4671 "Expected a 128/256/512-bit vector type");
4672 unsigned NumElts = VT.getSizeInBits() / 32;
4673 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4674 return DAG.getBitcast(VT, Vec);
4675}
4676
4677static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4678 SDValue In, SelectionDAG &DAG) {
4679 EVT InVT = In.getValueType();
4680 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4681 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4682 ISD::ZERO_EXTEND == Opcode) &&
4683 "Unknown extension opcode");
4684
4685 // For 256-bit vectors, we only need the lower (128-bit) input half.
4686 // For 512-bit vectors, we only need the lower input half or quarter.
4687 if (InVT.getSizeInBits() > 128) {
4688 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4689 "Expected VTs to be the same size!");
4690 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4691 In = extractSubVector(In, 0, DAG, DL,
4692 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4693 InVT = In.getValueType();
4694 }
4695
4696 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4697 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4698
4699 return DAG.getNode(Opcode, DL, VT, In);
4700}
4701
4702// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4703static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4704 SDValue Mask, SelectionDAG &DAG) {
4705 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4706 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4707 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4708}
4709
4710 static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4711                                     bool Lo, bool Unary) {
4712 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4713 "Illegal vector type to unpack");
4714 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4715 int NumElts = VT.getVectorNumElements();
4716 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4717 for (int i = 0; i < NumElts; ++i) {
4718 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4719 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4720 Pos += (Unary ? 0 : NumElts * (i % 2));
4721 Pos += (Lo ? 0 : NumEltsInLane / 2);
4722 Mask.push_back(Pos);
4723 }
4724}
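// For example, for a binary v8i16 unpack this produces the PUNPCKLWD/PUNPCKHWD
// patterns: Lo --> <0, 8, 1, 9, 2, 10, 3, 11> and
//           Hi --> <4, 12, 5, 13, 6, 14, 7, 15>.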
4725
4726/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4727/// imposed by AVX and specific to the unary pattern. Example:
4728/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4729/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4730 static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4731                                     bool Lo) {
4732 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4733 int NumElts = VT.getVectorNumElements();
4734 for (int i = 0; i < NumElts; ++i) {
4735 int Pos = i / 2;
4736 Pos += (Lo ? 0 : NumElts / 2);
4737 Mask.push_back(Pos);
4738 }
4739}
4740
4741// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4742static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4743 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4744   if (ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) &&
4745       (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4746 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4747 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4748 int M = Mask[I];
4749 if (M < 0)
4750 continue;
4751 SDValue V = (M < NumElts) ? V1 : V2;
4752 if (V.isUndef())
4753 continue;
4754 Ops[I] = V.getOperand(M % NumElts);
4755 }
4756 return DAG.getBuildVector(VT, dl, Ops);
4757 }
4758
4759 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4760}
4761
4762/// Returns a vector_shuffle node for an unpackl operation.
4763static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4764 SDValue V1, SDValue V2) {
4765   SmallVector<int, 8> Mask;
4766   createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4767 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4768}
4769
4770/// Returns a vector_shuffle node for an unpackh operation.
4771static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4772 SDValue V1, SDValue V2) {
4773   SmallVector<int, 8> Mask;
4774   createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4775 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4776}
4777
4778/// Returns a node that packs the LHS + RHS nodes together at half width.
4779/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4780/// TODO: Add subvector splitting if/when we have a need for it.
4781static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4782 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4783 bool PackHiHalf = false) {
4784 MVT OpVT = LHS.getSimpleValueType();
4785 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4786 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4787 assert(OpVT == RHS.getSimpleValueType() &&
4788 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4789 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4790 "Unexpected PACK operand types");
4791 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4792 "Unexpected PACK result type");
4793
4794 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4795 if (EltSizeInBits == 32) {
4796 SmallVector<int> PackMask;
4797 int Offset = PackHiHalf ? 1 : 0;
4798 int NumElts = VT.getVectorNumElements();
4799 for (int I = 0; I != NumElts; I += 4) {
4800 PackMask.push_back(I + Offset);
4801 PackMask.push_back(I + Offset + 2);
4802 PackMask.push_back(I + Offset + NumElts);
4803 PackMask.push_back(I + Offset + NumElts + 2);
4804 }
4805 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4806 DAG.getBitcast(VT, RHS), PackMask);
4807 }
4808
4809 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4810 if (!PackHiHalf) {
4811 if (UsePackUS &&
4812 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4813 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4814 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4815
4816 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4817 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4818 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4819 }
4820
4821 // Fallback to sign/zero extending the requested half and pack.
4822 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4823 if (UsePackUS) {
4824 if (PackHiHalf) {
4825 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4826 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4827 } else {
4828 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4829 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4830 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4831 };
4832 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4833 };
4834
4835 if (!PackHiHalf) {
4836 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4837 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4838 }
4839 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4840 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4841 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4842}
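// For example, getPack(DAG, Subtarget, dl, MVT::v8i16, LHS, RHS) with v4i32
// operands yields <L0, L1, L2, L3, R0, R1, R2, R3>, where each element is the
// low 16 bits of the corresponding i32 (or the high 16 bits with PackHiHalf),
// using PACKUS/PACKSS directly when the discarded bits are already known zero
// or sign bits and shifting first otherwise.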
4843
4844 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
4845/// This produces a shuffle where the low element of V2 is swizzled into the
4846/// zero/undef vector, landing at element Idx.
4847/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4848 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4849                                            bool IsZero,
4850 const X86Subtarget &Subtarget,
4851 SelectionDAG &DAG) {
4852 MVT VT = V2.getSimpleValueType();
4853 SDValue V1 = IsZero
4854 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4855 int NumElems = VT.getVectorNumElements();
4856 SmallVector<int, 16> MaskVec(NumElems);
4857 for (int i = 0; i != NumElems; ++i)
4858 // If this is the insertion idx, put the low elt of V2 here.
4859 MaskVec[i] = (i == Idx) ? NumElems : i;
4860 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4861}
4862
4863 static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4864   if (Ptr.getOpcode() == X86ISD::Wrapper ||
4865 Ptr.getOpcode() == X86ISD::WrapperRIP)
4866 Ptr = Ptr.getOperand(0);
4867 return dyn_cast<ConstantPoolSDNode>(Ptr);
4868}
4869
4870// TODO: Add support for non-zero offsets.
4871 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4872   ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4873   if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4874 return nullptr;
4875 return CNode->getConstVal();
4876}
4877
4878 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4879   if (!Load || !ISD::isNormalLoad(Load))
4880 return nullptr;
4881 return getTargetConstantFromBasePtr(Load->getBasePtr());
4882}
4883
4884 static const Constant *getTargetConstantFromNode(SDValue Op) {
4885   Op = peekThroughBitcasts(Op);
4886   return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4887}
4888
4889const Constant *
4890 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4891   assert(LD && "Unexpected null LoadSDNode");
4892 return getTargetConstantFromNode(LD);
4893}
4894
4895// Extract raw constant bits from constant pools.
4896static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4897 APInt &UndefElts,
4898 SmallVectorImpl<APInt> &EltBits,
4899 bool AllowWholeUndefs = true,
4900 bool AllowPartialUndefs = false) {
4901 assert(EltBits.empty() && "Expected an empty EltBits vector");
4902
4903   Op = peekThroughBitcasts(Op);
4904
4905 EVT VT = Op.getValueType();
4906 unsigned SizeInBits = VT.getSizeInBits();
4907 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4908 unsigned NumElts = SizeInBits / EltSizeInBits;
4909
4910 // Bitcast a source array of element bits to the target size.
4911 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4912 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4913 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4914 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4915 "Constant bit sizes don't match");
4916
4917 // Don't split if we don't allow undef bits.
4918 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4919 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4920 return false;
4921
4922 // If we're already the right size, don't bother bitcasting.
4923 if (NumSrcElts == NumElts) {
4924 UndefElts = UndefSrcElts;
4925 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4926 return true;
4927 }
4928
4929 // Extract all the undef/constant element data and pack into single bitsets.
4930 APInt UndefBits(SizeInBits, 0);
4931 APInt MaskBits(SizeInBits, 0);
4932
4933 for (unsigned i = 0; i != NumSrcElts; ++i) {
4934 unsigned BitOffset = i * SrcEltSizeInBits;
4935 if (UndefSrcElts[i])
4936 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4937 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4938 }
4939
4940 // Split the undef/constant single bitset data into the target elements.
4941 UndefElts = APInt(NumElts, 0);
4942 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4943
4944 for (unsigned i = 0; i != NumElts; ++i) {
4945 unsigned BitOffset = i * EltSizeInBits;
4946 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4947
4948 // Only treat an element as UNDEF if all bits are UNDEF.
4949 if (UndefEltBits.isAllOnes()) {
4950 if (!AllowWholeUndefs)
4951 return false;
4952 UndefElts.setBit(i);
4953 continue;
4954 }
4955
4956 // If only some bits are UNDEF then treat them as zero (or bail if not
4957 // supported).
4958 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4959 return false;
4960
4961 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4962 }
4963 return true;
4964 };
4965
4966 // Collect constant bits and insert into mask/undef bit masks.
4967 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4968 unsigned UndefBitIndex) {
4969 if (!Cst)
4970 return false;
4971 if (isa<UndefValue>(Cst)) {
4972 Undefs.setBit(UndefBitIndex);
4973 return true;
4974 }
4975 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4976 Mask = CInt->getValue();
4977 return true;
4978 }
4979 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4980 Mask = CFP->getValueAPF().bitcastToAPInt();
4981 return true;
4982 }
4983 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4984 Type *Ty = CDS->getType();
4985       Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4986       Type *EltTy = CDS->getElementType();
4987 bool IsInteger = EltTy->isIntegerTy();
4988 bool IsFP =
4989 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4990 if (!IsInteger && !IsFP)
4991 return false;
4992 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4993 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4994 if (IsInteger)
4995 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4996 else
4997 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4998 I * EltBits);
4999 return true;
5000 }
5001 return false;
5002 };
5003
5004 // Handle UNDEFs.
5005 if (Op.isUndef()) {
5006 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5007 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5008 return CastBitData(UndefSrcElts, SrcEltBits);
5009 }
5010
5011 // Extract scalar constant bits.
5012 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5013 APInt UndefSrcElts = APInt::getZero(1);
5014 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5015 return CastBitData(UndefSrcElts, SrcEltBits);
5016 }
5017 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5018 APInt UndefSrcElts = APInt::getZero(1);
5019 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5020 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5021 return CastBitData(UndefSrcElts, SrcEltBits);
5022 }
5023
5024 // Extract constant bits from build vector.
5025 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5026 BitVector Undefs;
5027 SmallVector<APInt> SrcEltBits;
5028 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5029 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5030 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5031 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5032 if (Undefs[I])
5033 UndefSrcElts.setBit(I);
5034 return CastBitData(UndefSrcElts, SrcEltBits);
5035 }
5036 }
5037
5038 // Extract constant bits from constant pool vector.
5039 if (auto *Cst = getTargetConstantFromNode(Op)) {
5040 Type *CstTy = Cst->getType();
5041 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5042 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5043 return false;
5044
5045 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5046 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5047 if ((SizeInBits % SrcEltSizeInBits) != 0)
5048 return false;
5049
5050 APInt UndefSrcElts(NumSrcElts, 0);
5051 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5052 for (unsigned i = 0; i != NumSrcElts; ++i)
5053 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5054 UndefSrcElts, i))
5055 return false;
5056
5057 return CastBitData(UndefSrcElts, SrcEltBits);
5058 }
5059
5060 // Extract constant bits from a broadcasted constant pool scalar.
5061 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5062 EltSizeInBits <= VT.getScalarSizeInBits()) {
5063 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5064 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5065 return false;
5066
5067 SDValue Ptr = MemIntr->getBasePtr();
5068     if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5069       unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5070 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5071
5072 APInt UndefSrcElts(NumSrcElts, 0);
5073 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5074 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5075 if (UndefSrcElts[0])
5076 UndefSrcElts.setBits(0, NumSrcElts);
5077 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5078 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5079 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5080 return CastBitData(UndefSrcElts, SrcEltBits);
5081 }
5082 }
5083 }
5084
5085 // Extract constant bits from a subvector broadcast.
5086 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5087 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5088 SDValue Ptr = MemIntr->getBasePtr();
5089 // The source constant may be larger than the subvector broadcast,
5090 // ensure we extract the correct subvector constants.
5091 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5092 Type *CstTy = Cst->getType();
5093 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5094 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5095 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5096 (SizeInBits % SubVecSizeInBits) != 0)
5097 return false;
5098 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5099 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5100 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5101 APInt UndefSubElts(NumSubElts, 0);
5102 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5103 APInt(CstEltSizeInBits, 0));
5104 for (unsigned i = 0; i != NumSubElts; ++i) {
5105 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5106 UndefSubElts, i))
5107 return false;
5108 for (unsigned j = 1; j != NumSubVecs; ++j)
5109 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5110 }
5111 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5112 UndefSubElts);
5113 return CastBitData(UndefSubElts, SubEltBits);
5114 }
5115 }
5116
5117 // Extract a rematerialized scalar constant insertion.
5118 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5119 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5120 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5121 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5122 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5123
5124 APInt UndefSrcElts(NumSrcElts, 0);
5125 SmallVector<APInt, 64> SrcEltBits;
5126 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5127 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5128 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5129 return CastBitData(UndefSrcElts, SrcEltBits);
5130 }
5131
5132 // Insert constant bits from a base and sub vector sources.
5133 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5134 // If bitcasts to larger elements we might lose track of undefs - don't
5135 // allow any to be safe.
5136 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5137 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5138
5139 APInt UndefSrcElts, UndefSubElts;
5140 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5141 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5142 UndefSubElts, EltSubBits,
5143 AllowWholeUndefs && AllowUndefs,
5144 AllowPartialUndefs && AllowUndefs) &&
5145 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5146 UndefSrcElts, EltSrcBits,
5147 AllowWholeUndefs && AllowUndefs,
5148 AllowPartialUndefs && AllowUndefs)) {
5149 unsigned BaseIdx = Op.getConstantOperandVal(2);
5150 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5151 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5152 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5153 return CastBitData(UndefSrcElts, EltSrcBits);
5154 }
5155 }
5156
5157 // Extract constant bits from a subvector's source.
5158 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
5159 // TODO - support extract_subvector through bitcasts.
5160 if (EltSizeInBits != VT.getScalarSizeInBits())
5161 return false;
5162
5163 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5164 UndefElts, EltBits, AllowWholeUndefs,
5165 AllowPartialUndefs)) {
5166 EVT SrcVT = Op.getOperand(0).getValueType();
5167 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5168 unsigned NumSubElts = VT.getVectorNumElements();
5169 unsigned BaseIdx = Op.getConstantOperandVal(1);
5170 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5171 if ((BaseIdx + NumSubElts) != NumSrcElts)
5172 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5173 if (BaseIdx != 0)
5174 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5175 return true;
5176 }
5177 }
5178
5179 // Extract constant bits from shuffle node sources.
5180 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5181 // TODO - support shuffle through bitcasts.
5182 if (EltSizeInBits != VT.getScalarSizeInBits())
5183 return false;
5184
5185 ArrayRef<int> Mask = SVN->getMask();
5186 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5187 llvm::any_of(Mask, [](int M) { return M < 0; }))
5188 return false;
5189
5190 APInt UndefElts0, UndefElts1;
5191 SmallVector<APInt, 32> EltBits0, EltBits1;
5192 if (isAnyInRange(Mask, 0, NumElts) &&
5193 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5194 UndefElts0, EltBits0, AllowWholeUndefs,
5195 AllowPartialUndefs))
5196 return false;
5197 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5198 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5199 UndefElts1, EltBits1, AllowWholeUndefs,
5200 AllowPartialUndefs))
5201 return false;
5202
5203 UndefElts = APInt::getZero(NumElts);
5204 for (int i = 0; i != (int)NumElts; ++i) {
5205 int M = Mask[i];
5206 if (M < 0) {
5207 UndefElts.setBit(i);
5208 EltBits.push_back(APInt::getZero(EltSizeInBits));
5209 } else if (M < (int)NumElts) {
5210 if (UndefElts0[M])
5211 UndefElts.setBit(i);
5212 EltBits.push_back(EltBits0[M]);
5213 } else {
5214 if (UndefElts1[M - NumElts])
5215 UndefElts.setBit(i);
5216 EltBits.push_back(EltBits1[M - NumElts]);
5217 }
5218 }
5219 return true;
5220 }
5221
5222 return false;
5223}
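// For example, querying a v2i64 constant build vector <0x0000000100000002,
// undef> with EltSizeInBits == 32 yields four 32-bit elements <2, 1, -, ->
// with UndefElts == 0b1100, relying on AllowWholeUndefs to accept the two
// fully-undef lanes.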
5224
5225namespace llvm {
5226namespace X86 {
5227bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5228 APInt UndefElts;
5229 SmallVector<APInt, 16> EltBits;
5230   if (getTargetConstantBitsFromNode(
5231           Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5232 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5233 int SplatIndex = -1;
5234 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5235 if (UndefElts[i])
5236 continue;
5237 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5238 SplatIndex = -1;
5239 break;
5240 }
5241 SplatIndex = i;
5242 }
5243 if (0 <= SplatIndex) {
5244 SplatVal = EltBits[SplatIndex];
5245 return true;
5246 }
5247 }
5248
5249 return false;
5250}
5251} // namespace X86
5252} // namespace llvm
5253
5254 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5255                                         unsigned MaskEltSizeInBits,
5256                                         SmallVectorImpl<uint64_t> &RawMask,
5257                                         APInt &UndefElts) {
5258 // Extract the raw target constant bits.
5259 SmallVector<APInt, 64> EltBits;
5260 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5261 EltBits, /* AllowWholeUndefs */ true,
5262 /* AllowPartialUndefs */ false))
5263 return false;
5264
5265 // Insert the extracted elements into the mask.
5266 for (const APInt &Elt : EltBits)
5267 RawMask.push_back(Elt.getZExtValue());
5268
5269 return true;
5270}
5271
5272static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5273 bool AllowUndefs) {
5274 APInt UndefElts;
5275 SmallVector<APInt, 64> EltBits;
5276 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5277 /*AllowWholeUndefs*/ AllowUndefs,
5278 /*AllowPartialUndefs*/ false))
5279 return false;
5280
5281 bool IsPow2OrUndef = true;
5282 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5283 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5284 return IsPow2OrUndef;
5285}
5286
5287// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5288 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5289   // TODO: don't always ignore oneuse constraints.
5290 V = peekThroughBitcasts(V);
5291 EVT VT = V.getValueType();
5292
5293 // Match not(xor X, -1) -> X.
5294 if (V.getOpcode() == ISD::XOR &&
5295 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5296 isAllOnesConstant(V.getOperand(1))))
5297 return V.getOperand(0);
5298
5299 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5300 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5301 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5302 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5303 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5304 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5305 V.getOperand(1));
5306 }
5307 }
5308
5309 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5310 if (V.getOpcode() == X86ISD::PCMPGT &&
5311 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5312 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5313 V.getOperand(0).hasOneUse()) {
5314 APInt UndefElts;
5315 SmallVector<APInt> EltBits;
5316 if (getTargetConstantBitsFromNode(V.getOperand(0),
5317 V.getScalarValueSizeInBits(), UndefElts,
5318 EltBits) &&
5319 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5320 // Don't fold min_signed_value -> (min_signed_value - 1)
5321 bool MinSigned = false;
5322 for (APInt &Elt : EltBits) {
5323 MinSigned |= Elt.isMinSignedValue();
5324 Elt -= 1;
5325 }
5326 if (!MinSigned) {
5327 SDLoc DL(V);
5328 MVT VT = V.getSimpleValueType();
5329 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5330 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5331 }
5332 }
5333 }
5334
5335 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5336   SmallVector<SDValue, 2> CatOps;
5337   if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5338 for (SDValue &CatOp : CatOps) {
5339 SDValue NotCat = IsNOT(CatOp, DAG);
5340 if (!NotCat)
5341 return SDValue();
5342 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5343 }
5344 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5345 }
5346
5347 // Match not(or(not(X),not(Y))) -> and(X, Y).
5348 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5349 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5350 // TODO: Handle cases with single NOT operand -> ANDNP
5351 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5352 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5353 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5354 DAG.getBitcast(VT, Op1));
5355 }
5356
5357 return SDValue();
5358}
5359
5360/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5361/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5362/// Note: This ignores saturation, so inputs must be checked first.
5363 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5364                                   bool Unary, unsigned NumStages = 1) {
5365 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5366 unsigned NumElts = VT.getVectorNumElements();
5367 unsigned NumLanes = VT.getSizeInBits() / 128;
5368 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5369 unsigned Offset = Unary ? 0 : NumElts;
5370 unsigned Repetitions = 1u << (NumStages - 1);
5371 unsigned Increment = 1u << NumStages;
5372 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5373
5374 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5375 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5376 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5377 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5378 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5379 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5380 }
5381 }
5382}
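// For example, a single-stage binary v16i8 pack mask is
// <0, 2, 4, ..., 14, 16, 18, ..., 30>, i.e. the low byte of every 16-bit
// element of both inputs - the truncation pattern PACKUSWB/PACKSSWB implement
// when no saturation occurs.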
5383
5384// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5385static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5386 APInt &DemandedLHS, APInt &DemandedRHS) {
5387 int NumLanes = VT.getSizeInBits() / 128;
5388 int NumElts = DemandedElts.getBitWidth();
5389 int NumInnerElts = NumElts / 2;
5390 int NumEltsPerLane = NumElts / NumLanes;
5391 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5392
5393 DemandedLHS = APInt::getZero(NumInnerElts);
5394 DemandedRHS = APInt::getZero(NumInnerElts);
5395
5396 // Map DemandedElts to the packed operands.
5397 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5398 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5399 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5400 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5401 if (DemandedElts[OuterIdx])
5402 DemandedLHS.setBit(InnerIdx);
5403 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5404 DemandedRHS.setBit(InnerIdx);
5405 }
5406 }
5407}
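// For example, with a v16i8 pack result (a single 128-bit lane), demanding
// result element 3 sets bit 3 of DemandedLHS, while demanding result element
// 11 sets bit 3 of DemandedRHS, because the upper half of each lane is
// produced from the RHS operand.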
5408
5409// Split the demanded elts of a HADD/HSUB node between its operands.
5410static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5411 APInt &DemandedLHS, APInt &DemandedRHS) {
5412   getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5413                                       DemandedLHS, DemandedRHS);
5414 DemandedLHS |= DemandedLHS << 1;
5415 DemandedRHS |= DemandedRHS << 1;
5416}
5417
5418/// Calculates the shuffle mask corresponding to the target-specific opcode.
5419/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5420/// operands in \p Ops, and returns true.
5421/// Sets \p IsUnary to true if only one source is used. Note that this will set
5422/// IsUnary for shuffles which use a single input multiple times, and in those
5423/// cases it will adjust the mask to only have indices within that single input.
5424/// It is an error to call this with non-empty Mask/Ops vectors.
5425static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5426                                  SmallVectorImpl<SDValue> &Ops,
5427                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5428 if (!isTargetShuffle(N.getOpcode()))
5429 return false;
5430
5431 MVT VT = N.getSimpleValueType();
5432 unsigned NumElems = VT.getVectorNumElements();
5433 unsigned MaskEltSize = VT.getScalarSizeInBits();
5434   SmallVector<uint64_t, 64> RawMask;
5435   APInt RawUndefs;
5436 uint64_t ImmN;
5437
5438 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5439 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5440
5441 IsUnary = false;
5442 bool IsFakeUnary = false;
5443 switch (N.getOpcode()) {
5444 case X86ISD::BLENDI:
5445 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5446 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5447 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5448 DecodeBLENDMask(NumElems, ImmN, Mask);
5449 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5450 break;
5451 case X86ISD::SHUFP:
5452 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5453 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5454 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5455 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5456 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5457 break;
5458 case X86ISD::INSERTPS:
5459 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5460 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5461 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5462 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5463 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5464 break;
5465 case X86ISD::EXTRQI:
5466 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5467 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5468 isa<ConstantSDNode>(N.getOperand(2))) {
5469 int BitLen = N.getConstantOperandVal(1);
5470 int BitIdx = N.getConstantOperandVal(2);
5471 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5472 IsUnary = true;
5473 }
5474 break;
5475 case X86ISD::INSERTQI:
5476 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5477 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5478 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5479 isa<ConstantSDNode>(N.getOperand(3))) {
5480 int BitLen = N.getConstantOperandVal(2);
5481 int BitIdx = N.getConstantOperandVal(3);
5482 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5483 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5484 }
5485 break;
5486 case X86ISD::UNPCKH:
5487 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5488 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5489 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5490 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5491 break;
5492 case X86ISD::UNPCKL:
5493 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5494 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5495 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5496 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5497 break;
5498 case X86ISD::MOVHLPS:
5499 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5500 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5501 DecodeMOVHLPSMask(NumElems, Mask);
5502 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5503 break;
5504 case X86ISD::MOVLHPS:
5505 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5506 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5507 DecodeMOVLHPSMask(NumElems, Mask);
5508 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5509 break;
5510 case X86ISD::VALIGN:
5511 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5512 "Only 32-bit and 64-bit elements are supported!");
5513 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5514 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5515 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5516 DecodeVALIGNMask(NumElems, ImmN, Mask);
5517 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5518 Ops.push_back(N.getOperand(1));
5519 Ops.push_back(N.getOperand(0));
5520 break;
5521 case X86ISD::PALIGNR:
5522 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5523 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5524 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5525 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5526 DecodePALIGNRMask(NumElems, ImmN, Mask);
5527 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5528 Ops.push_back(N.getOperand(1));
5529 Ops.push_back(N.getOperand(0));
5530 break;
5531 case X86ISD::VSHLDQ:
5532 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5533 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5534 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5535 DecodePSLLDQMask(NumElems, ImmN, Mask);
5536 IsUnary = true;
5537 break;
5538 case X86ISD::VSRLDQ:
5539 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5540 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5541 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5542 DecodePSRLDQMask(NumElems, ImmN, Mask);
5543 IsUnary = true;
5544 break;
5545 case X86ISD::PSHUFD:
5546 case X86ISD::VPERMILPI:
5547 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5548 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5549 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5550 IsUnary = true;
5551 break;
5552 case X86ISD::PSHUFHW:
5553 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5554 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5555 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5556 IsUnary = true;
5557 break;
5558 case X86ISD::PSHUFLW:
5559 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5560 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5561 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5562 IsUnary = true;
5563 break;
5564 case X86ISD::VZEXT_MOVL:
5565 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5566 DecodeZeroMoveLowMask(NumElems, Mask);
5567 IsUnary = true;
5568 break;
5569 case X86ISD::VBROADCAST:
5570 // We only decode broadcasts of same-sized vectors, peeking through to
5571 // extracted subvectors is likely to cause hasOneUse issues with
5572 // SimplifyDemandedBits etc.
5573 if (N.getOperand(0).getValueType() == VT) {
5574 DecodeVectorBroadcast(NumElems, Mask);
5575 IsUnary = true;
5576 break;
5577 }
5578 return false;
5579 case X86ISD::VPERMILPV: {
5580 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5581 IsUnary = true;
5582 SDValue MaskNode = N.getOperand(1);
5583 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5584 RawUndefs)) {
5585 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5586 break;
5587 }
5588 return false;
5589 }
5590 case X86ISD::PSHUFB: {
5591 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5592 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5593 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5594 IsUnary = true;
5595 SDValue MaskNode = N.getOperand(1);
5596 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5597 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5598 break;
5599 }
5600 return false;
5601 }
5602 case X86ISD::VPERMI:
5603 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5604 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5605 DecodeVPERMMask(NumElems, ImmN, Mask);
5606 IsUnary = true;
5607 break;
5608 case X86ISD::MOVSS:
5609 case X86ISD::MOVSD:
5610 case X86ISD::MOVSH:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5614 break;
5615 case X86ISD::VPERM2X128:
5616 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5617 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5618 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5619 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5620 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5621 break;
5622 case X86ISD::SHUF128:
5623 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5624 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5625 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5626 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5627 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5628 break;
5629 case X86ISD::MOVSLDUP:
5630 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5631 DecodeMOVSLDUPMask(NumElems, Mask);
5632 IsUnary = true;
5633 break;
5634 case X86ISD::MOVSHDUP:
5635 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5636 DecodeMOVSHDUPMask(NumElems, Mask);
5637 IsUnary = true;
5638 break;
5639 case X86ISD::MOVDDUP:
5640 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5641 DecodeMOVDDUPMask(NumElems, Mask);
5642 IsUnary = true;
5643 break;
5644 case X86ISD::VPERMIL2: {
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5647 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5648 SDValue MaskNode = N.getOperand(2);
5649 SDValue CtrlNode = N.getOperand(3);
5650 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5651 unsigned CtrlImm = CtrlOp->getZExtValue();
5652 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5653 RawUndefs)) {
5654 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5655 Mask);
5656 break;
5657 }
5658 }
5659 return false;
5660 }
5661 case X86ISD::VPPERM: {
5662 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5663 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5664 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5665 SDValue MaskNode = N.getOperand(2);
5666 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5667 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5668 break;
5669 }
5670 return false;
5671 }
5672 case X86ISD::VPERMV: {
5673 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5674 IsUnary = true;
5675 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5676 Ops.push_back(N.getOperand(1));
5677 SDValue MaskNode = N.getOperand(0);
5678 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5679 RawUndefs)) {
5680 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5681 break;
5682 }
5683 return false;
5684 }
5685 case X86ISD::VPERMV3: {
5686 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5687 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5688 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5689 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5690 Ops.push_back(N.getOperand(0));
5691 Ops.push_back(N.getOperand(2));
5692 SDValue MaskNode = N.getOperand(1);
5693 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5694 RawUndefs)) {
5695 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5696 break;
5697 }
5698 return false;
5699 }
5700 default:
5701 llvm_unreachable("unknown target shuffle node");
5702 }
5703
5704 // Empty mask indicates the decode failed.
5705 if (Mask.empty())
5706 return false;
5707
5708 // Check if we're getting a shuffle mask with zero'd elements.
5709 if (!AllowSentinelZero && isAnyZero(Mask))
5710 return false;
5711
5712 // If we have a fake unary shuffle, the shuffle mask is spread across two
5713 // inputs that are actually the same node. Re-map the mask to always point
5714 // into the first input.
5715 if (IsFakeUnary)
5716 for (int &M : Mask)
5717 if (M >= (int)Mask.size())
5718 M -= Mask.size();
5719
5720 // If we didn't already add operands in the opcode-specific code, default to
5721 // adding 1 or 2 operands starting at 0.
5722 if (Ops.empty()) {
5723 Ops.push_back(N.getOperand(0));
5724 if (!IsUnary || IsFakeUnary)
5725 Ops.push_back(N.getOperand(1));
5726 }
5727
5728 return true;
5729}
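// For example, a v4i32 X86ISD::UNPCKL decodes to Mask = <0, 4, 1, 5> with
// Ops = {Op0, Op1}; when both operands are the same node the mask is remapped
// to <0, 0, 1, 1> and IsUnary is set.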
5730
5731 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5732static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5733                                  SmallVectorImpl<SDValue> &Ops,
5734                                  SmallVectorImpl<int> &Mask) {
5735 bool IsUnary;
5736 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5737}
5738
5739/// Compute whether each element of a shuffle is zeroable.
5740///
5741/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5742/// Either it is an undef element in the shuffle mask, the element of the input
5743/// referenced is undef, or the element of the input referenced is known to be
5744/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5745/// as many lanes with this technique as possible to simplify the remaining
5746 /// shuffle.
5747 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5748 SDValue V1, SDValue V2,
5749 APInt &KnownUndef, APInt &KnownZero) {
5750 int Size = Mask.size();
5751 KnownUndef = KnownZero = APInt::getZero(Size);
5752
5753 V1 = peekThroughBitcasts(V1);
5754 V2 = peekThroughBitcasts(V2);
5755
5756 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5757 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5758
5759 int VectorSizeInBits = V1.getValueSizeInBits();
5760 int ScalarSizeInBits = VectorSizeInBits / Size;
5761 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5762
5763 for (int i = 0; i < Size; ++i) {
5764 int M = Mask[i];
5765 // Handle the easy cases.
5766 if (M < 0) {
5767 KnownUndef.setBit(i);
5768 continue;
5769 }
5770 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5771 KnownZero.setBit(i);
5772 continue;
5773 }
5774
5775 // Determine shuffle input and normalize the mask.
5776 SDValue V = M < Size ? V1 : V2;
5777 M %= Size;
5778
5779 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5780 if (V.getOpcode() != ISD::BUILD_VECTOR)
5781 continue;
5782
5783 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5784 // the (larger) source element must be UNDEF/ZERO.
5785 if ((Size % V.getNumOperands()) == 0) {
5786 int Scale = Size / V->getNumOperands();
5787 SDValue Op = V.getOperand(M / Scale);
5788 if (Op.isUndef())
5789 KnownUndef.setBit(i);
5790 if (X86::isZeroNode(Op))
5791 KnownZero.setBit(i);
5792 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5793 APInt Val = Cst->getAPIntValue();
5794 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5795 if (Val == 0)
5796 KnownZero.setBit(i);
5797 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5798 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5799 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5800 if (Val == 0)
5801 KnownZero.setBit(i);
5802 }
5803 continue;
5804 }
5805
5806 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5807 // elements must be UNDEF or ZERO.
5808 if ((V.getNumOperands() % Size) == 0) {
5809 int Scale = V->getNumOperands() / Size;
5810 bool AllUndef = true;
5811 bool AllZero = true;
5812 for (int j = 0; j < Scale; ++j) {
5813 SDValue Op = V.getOperand((M * Scale) + j);
5814 AllUndef &= Op.isUndef();
5815 AllZero &= X86::isZeroNode(Op);
5816 }
5817 if (AllUndef)
5818 KnownUndef.setBit(i);
5819 if (AllZero)
5820 KnownZero.setBit(i);
5821 continue;
5822 }
5823 }
5824}
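// Worked example (illustrative): with Mask = {0, 4, -1, 7} and V2 a
// build_vector of zeros, elements 1 and 3 are marked in KnownZero (they read
// the all-zero input) and element 2 is marked in KnownUndef.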
5825
5826/// Decode a target shuffle mask and inputs and see if any values are
5827/// known to be undef or zero from their inputs.
5828/// Returns true if the target shuffle mask was decoded.
5829 /// FIXME: Merge this with computeZeroableShuffleElements?
5830 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5831 SmallVectorImpl<SDValue> &Ops,
5832 APInt &KnownUndef, APInt &KnownZero) {
5833 bool IsUnary;
5834 if (!isTargetShuffle(N.getOpcode()))
5835 return false;
5836
5837 MVT VT = N.getSimpleValueType();
5838 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5839 return false;
5840
5841 int Size = Mask.size();
5842 SDValue V1 = Ops[0];
5843 SDValue V2 = IsUnary ? V1 : Ops[1];
5844 KnownUndef = KnownZero = APInt::getZero(Size);
5845
5846 V1 = peekThroughBitcasts(V1);
5847 V2 = peekThroughBitcasts(V2);
5848
5849 assert((VT.getSizeInBits() % Size) == 0 &&
5850 "Illegal split of shuffle value type");
5851 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5852
5853 // Extract known constant input data.
5854 APInt UndefSrcElts[2];
5855 SmallVector<APInt, 32> SrcEltBits[2];
5856 bool IsSrcConstant[2] = {
5857 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5858 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5859 /*AllowPartialUndefs*/ false),
5860 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5861 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5862 /*AllowPartialUndefs*/ false)};
5863
5864 for (int i = 0; i < Size; ++i) {
5865 int M = Mask[i];
5866
5867 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5868 if (M < 0) {
5869 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5870 if (SM_SentinelUndef == M)
5871 KnownUndef.setBit(i);
5872 if (SM_SentinelZero == M)
5873 KnownZero.setBit(i);
5874 continue;
5875 }
5876
5877 // Determine shuffle input and normalize the mask.
5878 unsigned SrcIdx = M / Size;
5879 SDValue V = M < Size ? V1 : V2;
5880 M %= Size;
5881
5882 // We are referencing an UNDEF input.
5883 if (V.isUndef()) {
5884 KnownUndef.setBit(i);
5885 continue;
5886 }
5887
5888 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5889 // TODO: We currently only set UNDEF for integer types - floats use the same
5890 // registers as vectors and many of the scalar folded loads rely on the
5891 // SCALAR_TO_VECTOR pattern.
5892 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5893 (Size % V.getValueType().getVectorNumElements()) == 0) {
5894 int Scale = Size / V.getValueType().getVectorNumElements();
5895 int Idx = M / Scale;
5896 if (Idx != 0 && !VT.isFloatingPoint())
5897 KnownUndef.setBit(i);
5898 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5899 KnownZero.setBit(i);
5900 continue;
5901 }
5902
5903 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5904 // base vectors.
5905 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5906 SDValue Vec = V.getOperand(0);
5907 int NumVecElts = Vec.getValueType().getVectorNumElements();
5908 if (Vec.isUndef() && Size == NumVecElts) {
5909 int Idx = V.getConstantOperandVal(2);
5910 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5911 if (M < Idx || (Idx + NumSubElts) <= M)
5912 KnownUndef.setBit(i);
5913 }
5914 continue;
5915 }
5916
5917 // Attempt to extract from the source's constant bits.
5918 if (IsSrcConstant[SrcIdx]) {
5919 if (UndefSrcElts[SrcIdx][M])
5920 KnownUndef.setBit(i);
5921 else if (SrcEltBits[SrcIdx][M] == 0)
5922 KnownZero.setBit(i);
5923 }
5924 }
5925
5926 assert(VT.getVectorNumElements() == (unsigned)Size &&
5927 "Different mask size from vector size!");
5928 return true;
5929}
5930
5931 // Replace target shuffle mask elements with known undef/zero sentinels.
5932 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5933 const APInt &KnownUndef,
5934 const APInt &KnownZero,
5935 bool ResolveKnownZeros= true) {
5936 unsigned NumElts = Mask.size();
5937 assert(KnownUndef.getBitWidth() == NumElts &&
5938 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5939
5940 for (unsigned i = 0; i != NumElts; ++i) {
5941 if (KnownUndef[i])
5942 Mask[i] = SM_SentinelUndef;
5943 else if (ResolveKnownZeros && KnownZero[i])
5944 Mask[i] = SM_SentinelZero;
5945 }
5946}
5947
5948 // Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5949 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5950 APInt &KnownUndef,
5951 APInt &KnownZero) {
5952 unsigned NumElts = Mask.size();
5953 KnownUndef = KnownZero = APInt::getZero(NumElts);
5954
5955 for (unsigned i = 0; i != NumElts; ++i) {
5956 int M = Mask[i];
5957 if (SM_SentinelUndef == M)
5958 KnownUndef.setBit(i);
5959 if (SM_SentinelZero == M)
5960 KnownZero.setBit(i);
5961 }
5962}
5963
5964 // Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
5965 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5966 SDValue Cond, bool IsBLENDV = false) {
5967 EVT CondVT = Cond.getValueType();
5968 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5969 unsigned NumElts = CondVT.getVectorNumElements();
5970
5971 APInt UndefElts;
5972 SmallVector<APInt, 32> EltBits;
5973 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5974 /*AllowWholeUndefs*/ true,
5975 /*AllowPartialUndefs*/ false))
5976 return false;
5977
5978 Mask.resize(NumElts, SM_SentinelUndef);
5979
5980 for (int i = 0; i != (int)NumElts; ++i) {
5981 Mask[i] = i;
5982 // Arbitrarily choose from the 2nd operand if the select condition element
5983 // is undef.
5984 // TODO: Can we do better by matching patterns such as even/odd?
5985 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5986 (IsBLENDV && EltBits[i].isNonNegative()))
5987 Mask[i] += NumElts;
5988 }
5989
5990 return true;
5991}
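// Worked example (illustrative): a v4i32 VSELECT with the constant condition
// <-1, 0, -1, 0> yields Mask = {0, 5, 2, 7} - lanes whose condition element is
// zero select from the second operand.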
5992
5993// Forward declaration (for getFauxShuffleMask recursive check).
5994 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5995 SmallVectorImpl<SDValue> &Inputs,
5996 SmallVectorImpl<int> &Mask,
5997 const SelectionDAG &DAG, unsigned Depth,
5998 bool ResolveKnownElts);
5999
6000// Attempt to decode ops that could be represented as a shuffle mask.
6001 // The decoded shuffle mask may contain a different number of elements than the
6002// destination value type.
6003// TODO: Merge into getTargetShuffleInputs()
6004 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6005 SmallVectorImpl<int> &Mask,
6006 SmallVectorImpl<SDValue> &Ops,
6007 const SelectionDAG &DAG, unsigned Depth,
6008 bool ResolveKnownElts) {
6009 Mask.clear();
6010 Ops.clear();
6011
6012 MVT VT = N.getSimpleValueType();
6013 unsigned NumElts = VT.getVectorNumElements();
6014 unsigned NumSizeInBits = VT.getSizeInBits();
6015 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6016 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6017 return false;
6018 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6019 unsigned NumSizeInBytes = NumSizeInBits / 8;
6020 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6021
6022 unsigned Opcode = N.getOpcode();
6023 switch (Opcode) {
6024 case ISD::VECTOR_SHUFFLE: {
6025 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6026 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6027 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6028 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6029 Ops.push_back(N.getOperand(0));
6030 Ops.push_back(N.getOperand(1));
6031 return true;
6032 }
6033 return false;
6034 }
6035 case ISD::AND:
6036 case X86ISD::ANDNP: {
6037 // Attempt to decode as a per-byte mask.
6038 APInt UndefElts;
6039 SmallVector<APInt, 32> EltBits;
6040 SDValue N0 = N.getOperand(0);
6041 SDValue N1 = N.getOperand(1);
6042 bool IsAndN = (X86ISD::ANDNP == Opcode);
6043 uint64_t ZeroMask = IsAndN ? 255 : 0;
6044 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6045 /*AllowWholeUndefs*/ false,
6046 /*AllowPartialUndefs*/ false))
6047 return false;
6048 // We can't assume an undef src element gives an undef dst - the other src
6049 // might be zero.
6050 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6051 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6052 const APInt &ByteBits = EltBits[i];
6053 if (ByteBits != 0 && ByteBits != 255)
6054 return false;
6055 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6056 }
6057 Ops.push_back(IsAndN ? N1 : N0);
6058 return true;
6059 }
6060 case ISD::OR: {
6061 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6062 // is a valid shuffle index.
6063 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6064 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6065 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6066 return false;
6067
6068 SmallVector<int, 64> SrcMask0, SrcMask1;
6069 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6070 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6071 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6072 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6073 Depth + 1, true) ||
6074 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6075 Depth + 1, true))
6076 return false;
6077
6078 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6079 SmallVector<int, 64> Mask0, Mask1;
6080 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6081 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6082 for (int i = 0; i != (int)MaskSize; ++i) {
6083 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6084 // loops converting between OR and BLEND shuffles due to
6085 // canWidenShuffleElements merging away undef elements, meaning we
6086 // fail to recognise the OR as the undef element isn't known zero.
6087 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6088 Mask.push_back(SM_SentinelZero);
6089 else if (Mask1[i] == SM_SentinelZero)
6090 Mask.push_back(i);
6091 else if (Mask0[i] == SM_SentinelZero)
6092 Mask.push_back(i + MaskSize);
6093 else
6094 return false;
6095 }
6096 Ops.push_back(N0);
6097 Ops.push_back(N1);
6098 return true;
6099 }
6100 case ISD::INSERT_SUBVECTOR: {
6101 SDValue Src = N.getOperand(0);
6102 SDValue Sub = N.getOperand(1);
6103 EVT SubVT = Sub.getValueType();
6104 unsigned NumSubElts = SubVT.getVectorNumElements();
6105 if (!N->isOnlyUserOf(Sub.getNode()))
6106 return false;
6107 SDValue SubBC = peekThroughBitcasts(Sub);
6108 uint64_t InsertIdx = N.getConstantOperandVal(2);
6109 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6110 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6111 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6112 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
6113 SDValue SubBCSrc = SubBC.getOperand(0);
6114 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
6115 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
6116 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
6117 "Subvector valuetype mismatch");
6118 InsertIdx *= (MaxElts / NumElts);
6119 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
6120 NumSubElts *= (MaxElts / NumElts);
6121 bool SrcIsUndef = Src.isUndef();
6122 for (int i = 0; i != (int)MaxElts; ++i)
6123 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6124 for (int i = 0; i != (int)NumSubElts; ++i)
6125 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6126 if (!SrcIsUndef)
6127 Ops.push_back(Src);
6128 Ops.push_back(SubBCSrc);
6129 return true;
6130 }
6131 // Handle CONCAT(SUB0, SUB1).
6132 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
6133 // cross lane shuffles.
6134 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6135 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
6136 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6137 Src.getOperand(0).isUndef() &&
6138 Src.getOperand(1).getValueType() == SubVT &&
6139 Src.getConstantOperandVal(2) == 0) {
6140 for (int i = 0; i != (int)NumSubElts; ++i)
6141 Mask.push_back(i);
6142 for (int i = 0; i != (int)NumSubElts; ++i)
6143 Mask.push_back(i + NumElts);
6144 Ops.push_back(Src.getOperand(1));
6145 Ops.push_back(Sub);
6146 return true;
6147 }
6148 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6149 SmallVector<int, 64> SubMask;
6150 SmallVector<SDValue, 2> SubInputs;
6151 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6152 EVT SubSrcVT = SubSrc.getValueType();
6153 if (!SubSrcVT.isVector())
6154 return false;
6155
6156 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6157 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6158 Depth + 1, ResolveKnownElts))
6159 return false;
6160
6161 // Subvector shuffle inputs must not be larger than the subvector.
6162 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6163 return SubVT.getFixedSizeInBits() <
6164 SubInput.getValueSizeInBits().getFixedValue();
6165 }))
6166 return false;
6167
6168 if (SubMask.size() != NumSubElts) {
6169 assert(((SubMask.size() % NumSubElts) == 0 ||
6170 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
6171 if ((NumSubElts % SubMask.size()) == 0) {
6172 int Scale = NumSubElts / SubMask.size();
6173 SmallVector<int,64> ScaledSubMask;
6174 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6175 SubMask = ScaledSubMask;
6176 } else {
6177 int Scale = SubMask.size() / NumSubElts;
6178 NumSubElts = SubMask.size();
6179 NumElts *= Scale;
6180 InsertIdx *= Scale;
6181 }
6182 }
6183 Ops.push_back(Src);
6184 Ops.append(SubInputs.begin(), SubInputs.end());
6185 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6186 Mask.append(NumElts, SM_SentinelZero);
6187 else
6188 for (int i = 0; i != (int)NumElts; ++i)
6189 Mask.push_back(i);
6190 for (int i = 0; i != (int)NumSubElts; ++i) {
6191 int M = SubMask[i];
6192 if (0 <= M) {
6193 int InputIdx = M / NumSubElts;
6194 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6195 }
6196 Mask[i + InsertIdx] = M;
6197 }
6198 return true;
6199 }
6200 case X86ISD::PINSRB:
6201 case X86ISD::PINSRW:
6202 case ISD::SCALAR_TO_VECTOR:
6203 case ISD::INSERT_VECTOR_ELT: {
6204 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6205 // vector, for matching src/dst vector types.
6206 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6207
6208 unsigned DstIdx = 0;
6209 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6210 // Check we have an in-range constant insertion index.
6211 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6212 N.getConstantOperandAPInt(2).uge(NumElts))
6213 return false;
6214 DstIdx = N.getConstantOperandVal(2);
6215
6216 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6217 if (X86::isZeroNode(Scl)) {
6218 Ops.push_back(N.getOperand(0));
6219 for (unsigned i = 0; i != NumElts; ++i)
6220 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6221 return true;
6222 }
6223 }
6224
6225 // Peek through trunc/aext/zext/bitcast.
6226 // TODO: aext shouldn't require SM_SentinelZero padding.
6227 // TODO: handle shift of scalars.
6228 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6229 while (Scl.getOpcode() == ISD::TRUNCATE ||
6230 Scl.getOpcode() == ISD::ANY_EXTEND ||
6231 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6232 (Scl.getOpcode() == ISD::BITCAST &&
6233 Scl.getScalarValueSizeInBits() ==
6234 Scl.getOperand(0).getScalarValueSizeInBits())) {
6235 Scl = Scl.getOperand(0);
6236 MinBitsPerElt =
6237 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6238 }
6239 if ((MinBitsPerElt % 8) != 0)
6240 return false;
6241
6242 // Attempt to find the source vector the scalar was extracted from.
6243 SDValue SrcExtract;
6244 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6245 Scl.getOpcode() == X86ISD::PEXTRW ||
6246 Scl.getOpcode() == X86ISD::PEXTRB) &&
6247 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6248 SrcExtract = Scl;
6249 }
6250 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6251 return false;
6252
6253 SDValue SrcVec = SrcExtract.getOperand(0);
6254 EVT SrcVT = SrcVec.getValueType();
6255 if (!SrcVT.getScalarType().isByteSized())
6256 return false;
6257 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6258 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6259 unsigned DstByte = DstIdx * NumBytesPerElt;
6260 MinBitsPerElt =
6261 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6262
6263 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6264 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6265 Ops.push_back(SrcVec);
6266 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6267 } else {
6268 Ops.push_back(SrcVec);
6269 Ops.push_back(N.getOperand(0));
6270 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6271 Mask.push_back(NumSizeInBytes + i);
6272 }
6273
6274 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6275 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6276 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6277 Mask[DstByte + i] = SrcByte + i;
6278 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6279 Mask[DstByte + i] = SM_SentinelZero;
6280 return true;
6281 }
6282 case X86ISD::PACKSS:
6283 case X86ISD::PACKUS: {
6284 SDValue N0 = N.getOperand(0);
6285 SDValue N1 = N.getOperand(1);
6286 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6287 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6288 "Unexpected input value type");
6289
6290 APInt EltsLHS, EltsRHS;
6291 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6292
6293 // If we know input saturation won't happen (or we don't care for particular
6294 // lanes), we can treat this as a truncation shuffle.
6295 bool Offset0 = false, Offset1 = false;
6296 if (Opcode == X86ISD::PACKSS) {
6297 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6298 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6299 (!(N1.isUndef() || EltsRHS.isZero()) &&
6300 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6301 return false;
6302 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6303 // PACKSS then it was likely being used for sign-extension for a
6304 // truncation, so just peek through and adjust the mask accordingly.
6305 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6306 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6307 Offset0 = true;
6308 N0 = N0.getOperand(0);
6309 }
6310 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6311 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6312 Offset1 = true;
6313 N1 = N1.getOperand(0);
6314 }
6315 } else {
6316 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6317 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6318 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6319 (!(N1.isUndef() || EltsRHS.isZero()) &&
6320 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6321 return false;
6322 }
6323
6324 bool IsUnary = (N0 == N1);
6325
6326 Ops.push_back(N0);
6327 if (!IsUnary)
6328 Ops.push_back(N1);
6329
6330 createPackShuffleMask(VT, Mask, IsUnary);
6331
6332 if (Offset0 || Offset1) {
6333 for (int &M : Mask)
6334 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6335 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6336 ++M;
6337 }
6338 return true;
6339 }
6340 case ISD::VSELECT:
6341 case X86ISD::BLENDV: {
6342 SDValue Cond = N.getOperand(0);
6343 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6344 Ops.push_back(N.getOperand(1));
6345 Ops.push_back(N.getOperand(2));
6346 return true;
6347 }
6348 return false;
6349 }
6350 case X86ISD::VTRUNC: {
6351 SDValue Src = N.getOperand(0);
6352 EVT SrcVT = Src.getValueType();
6353 // Truncated source must be a simple vector.
6354 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6355 (SrcVT.getScalarSizeInBits() % 8) != 0)
6356 return false;
6357 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6358 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6359 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6360 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6361 for (unsigned i = 0; i != NumSrcElts; ++i)
6362 Mask.push_back(i * Scale);
6363 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6364 Ops.push_back(Src);
6365 return true;
6366 }
6367 case ISD::SHL:
6368 case ISD::SRL: {
6369 // We can only decode 'whole byte' bit shifts as shuffles.
6370 std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
6371 if (!Amt || (*Amt % 8) != 0)
6372 return false;
6373
6374 uint64_t ByteShift = *Amt / 8;
6375 Ops.push_back(N.getOperand(0));
6376
6377 // Clear mask to all zeros and insert the shifted byte indices.
6378 Mask.append(NumSizeInBytes, SM_SentinelZero);
6379
6380 if (ISD::SHL == Opcode) {
6381 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6382 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6383 Mask[i + j] = i + j - ByteShift;
6384 } else {
6385 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6386 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6387 Mask[i + j - ByteShift] = i + j;
6388 }
6389 return true;
6390 }
6391 case X86ISD::VSHLI:
6392 case X86ISD::VSRLI: {
6393 uint64_t ShiftVal = N.getConstantOperandVal(1);
6394 // Out of range bit shifts are guaranteed to be zero.
6395 if (NumBitsPerElt <= ShiftVal) {
6396 Mask.append(NumElts, SM_SentinelZero);
6397 return true;
6398 }
6399
6400 // We can only decode 'whole byte' bit shifts as shuffles.
6401 if ((ShiftVal % 8) != 0)
6402 break;
6403
6404 uint64_t ByteShift = ShiftVal / 8;
6405 Ops.push_back(N.getOperand(0));
6406
6407 // Clear mask to all zeros and insert the shifted byte indices.
6408 Mask.append(NumSizeInBytes, SM_SentinelZero);
6409
6410 if (X86ISD::VSHLI == Opcode) {
6411 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6412 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6413 Mask[i + j] = i + j - ByteShift;
6414 } else {
6415 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6416 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6417 Mask[i + j - ByteShift] = i + j;
6418 }
6419 return true;
6420 }
6421 case X86ISD::VROTLI:
6422 case X86ISD::VROTRI: {
6423 // We can only decode 'whole byte' bit rotates as shuffles.
6424 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6425 if ((RotateVal % 8) != 0)
6426 return false;
6427 Ops.push_back(N.getOperand(0));
6428 int Offset = RotateVal / 8;
6429 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6430 for (int i = 0; i != (int)NumElts; ++i) {
6431 int BaseIdx = i * NumBytesPerElt;
6432 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6433 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6434 }
6435 }
6436 return true;
6437 }
6438 case X86ISD::VBROADCAST: {
6439 SDValue Src = N.getOperand(0);
6440 if (!Src.getSimpleValueType().isVector()) {
6441 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6442 !isNullConstant(Src.getOperand(1)) ||
6443 Src.getOperand(0).getValueType().getScalarType() !=
6444 VT.getScalarType())
6445 return false;
6446 Src = Src.getOperand(0);
6447 }
6448 Ops.push_back(Src);
6449 Mask.append(NumElts, 0);
6450 return true;
6451 }
6452 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6453 SDValue Src = N.getOperand(0);
6454 EVT SrcVT = Src.getValueType();
6455 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6456
6457 // Extended source must be a simple vector.
6458 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6459 (NumBitsPerSrcElt % 8) != 0)
6460 return false;
6461
6462 // We can only handle all-signbits extensions.
6463 APInt DemandedSrcElts =
6464 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6465 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6466 return false;
6467
6468 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6469 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6470 for (unsigned I = 0; I != NumElts; ++I)
6471 Mask.append(Scale, I);
6472 Ops.push_back(Src);
6473 return true;
6474 }
6475 case ISD::ZERO_EXTEND:
6476 case ISD::ANY_EXTEND:
6477 case ISD::ZERO_EXTEND_VECTOR_INREG:
6478 case ISD::ANY_EXTEND_VECTOR_INREG: {
6479 SDValue Src = N.getOperand(0);
6480 EVT SrcVT = Src.getValueType();
6481
6482 // Extended source must be a simple vector.
6483 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6484 (SrcVT.getScalarSizeInBits() % 8) != 0)
6485 return false;
6486
6487 bool IsAnyExtend =
6488 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6489 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6490 IsAnyExtend, Mask);
6491 Ops.push_back(Src);
6492 return true;
6493 }
6494 }
6495
6496 return false;
6497}
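// Worked example (illustrative): a v16i8 ISD::AND with the constant
// <0xFF, 0x00, 0xFF, 0x00, ...> decodes to a single-input byte shuffle
// {0, SM_SentinelZero, 2, SM_SentinelZero, ...}, i.e. the masked-off bytes are
// represented with zero sentinels instead of a second source.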
6498
6499 /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6500 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6501 SmallVectorImpl<int> &Mask) {
6502 int MaskWidth = Mask.size();
6503 SmallVector<SDValue, 16> UsedInputs;
6504 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6505 int lo = UsedInputs.size() * MaskWidth;
6506 int hi = lo + MaskWidth;
6507
6508 // Strip UNDEF input usage.
6509 if (Inputs[i].isUndef())
6510 for (int &M : Mask)
6511 if ((lo <= M) && (M < hi))
6512 M = SM_SentinelUndef;
6513
6514 // Check for unused inputs.
6515 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6516 for (int &M : Mask)
6517 if (lo <= M)
6518 M -= MaskWidth;
6519 continue;
6520 }
6521
6522 // Check for repeated inputs.
6523 bool IsRepeat = false;
6524 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6525 if (UsedInputs[j] != Inputs[i])
6526 continue;
6527 for (int &M : Mask)
6528 if (lo <= M)
6529 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6530 IsRepeat = true;
6531 break;
6532 }
6533 if (IsRepeat)
6534 continue;
6535
6536 UsedInputs.push_back(Inputs[i]);
6537 }
6538 Inputs = UsedInputs;
6539}
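// Worked example (illustrative): with Inputs = {A, A} and the width-4 mask
// {0, 5, 2, 7}, the duplicate input is dropped and the mask is remapped to
// {0, 1, 2, 3} over the single remaining input A.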
6540
6541/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6542/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6543/// Returns true if the target shuffle mask was decoded.
6544 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6545 SmallVectorImpl<SDValue> &Inputs,
6546 SmallVectorImpl<int> &Mask,
6547 APInt &KnownUndef, APInt &KnownZero,
6548 const SelectionDAG &DAG, unsigned Depth,
6549 bool ResolveKnownElts) {
6550 if (Depth >= SelectionDAG::MaxRecursionDepth)
6551 return false; // Limit search depth.
6552
6553 EVT VT = Op.getValueType();
6554 if (!VT.isSimple() || !VT.isVector())
6555 return false;
6556
6557 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6558 if (ResolveKnownElts)
6559 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6560 return true;
6561 }
6562 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6563 ResolveKnownElts)) {
6564 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6565 return true;
6566 }
6567 return false;
6568}
6569
6570 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6571 SmallVectorImpl<SDValue> &Inputs,
6572 SmallVectorImpl<int> &Mask,
6573 const SelectionDAG &DAG, unsigned Depth,
6574 bool ResolveKnownElts) {
6575 APInt KnownUndef, KnownZero;
6576 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6577 KnownZero, DAG, Depth, ResolveKnownElts);
6578}
6579
6580 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6581 SmallVectorImpl<int> &Mask,
6582 const SelectionDAG &DAG, unsigned Depth = 0,
6583 bool ResolveKnownElts = true) {
6584 EVT VT = Op.getValueType();
6585 if (!VT.isSimple() || !VT.isVector())
6586 return false;
6587
6588 unsigned NumElts = Op.getValueType().getVectorNumElements();
6589 APInt DemandedElts = APInt::getAllOnes(NumElts);
6590 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6591 ResolveKnownElts);
6592}
6593
6594// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6595static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6596 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6597 SelectionDAG &DAG) {
6598 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6599 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6600 "Unknown broadcast load type");
6601
6602 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6603 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6604 return SDValue();
6605
6606 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6607 TypeSize::getFixed(Offset), DL);
6608 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6609 SDValue Ops[] = {Mem->getChain(), Ptr};
6610 SDValue BcstLd = DAG.getMemIntrinsicNode(
6611 Opcode, DL, Tys, Ops, MemVT,
6612 DAG.getMachineFunction().getMachineMemOperand(
6613 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6614 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6615 return BcstLd;
6616}
6617
6618/// Returns the scalar element that will make up the i'th
6619/// element of the result of the vector shuffle.
6620static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6621 SelectionDAG &DAG, unsigned Depth) {
6622 if (Depth >= SelectionDAG::MaxRecursionDepth)
6623 return SDValue(); // Limit search depth.
6624
6625 EVT VT = Op.getValueType();
6626 unsigned Opcode = Op.getOpcode();
6627 unsigned NumElems = VT.getVectorNumElements();
6628
6629 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6630 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6631 int Elt = SV->getMaskElt(Index);
6632
6633 if (Elt < 0)
6634 return DAG.getUNDEF(VT.getVectorElementType());
6635
6636 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6637 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6638 }
6639
6640 // Recurse into target specific vector shuffles to find scalars.
6641 if (isTargetShuffle(Opcode)) {
6642 MVT ShufVT = VT.getSimpleVT();
6643 MVT ShufSVT = ShufVT.getVectorElementType();
6644 int NumElems = (int)ShufVT.getVectorNumElements();
6645 SmallVector<int, 16> ShuffleMask;
6646 SmallVector<SDValue, 16> ShuffleOps;
6647 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6648 return SDValue();
6649
6650 int Elt = ShuffleMask[Index];
6651 if (Elt == SM_SentinelZero)
6652 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6653 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6654 if (Elt == SM_SentinelUndef)
6655 return DAG.getUNDEF(ShufSVT);
6656
6657 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6658 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6659 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6660 }
6661
6662 // Recurse into insert_subvector base/sub vector to find scalars.
6663 if (Opcode == ISD::INSERT_SUBVECTOR) {
6664 SDValue Vec = Op.getOperand(0);
6665 SDValue Sub = Op.getOperand(1);
6666 uint64_t SubIdx = Op.getConstantOperandVal(2);
6667 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6668
6669 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6670 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6671 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6672 }
6673
6674 // Recurse into concat_vectors sub vector to find scalars.
6675 if (Opcode == ISD::CONCAT_VECTORS) {
6676 EVT SubVT = Op.getOperand(0).getValueType();
6677 unsigned NumSubElts = SubVT.getVectorNumElements();
6678 uint64_t SubIdx = Index / NumSubElts;
6679 uint64_t SubElt = Index % NumSubElts;
6680 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6681 }
6682
6683 // Recurse into extract_subvector src vector to find scalars.
6684 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6685 SDValue Src = Op.getOperand(0);
6686 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6687 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6688 }
6689
6690 // We only peek through bitcasts of the same vector width.
6691 if (Opcode == ISD::BITCAST) {
6692 SDValue Src = Op.getOperand(0);
6693 EVT SrcVT = Src.getValueType();
6694 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6695 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6696 return SDValue();
6697 }
6698
6699 // Actual nodes that may contain scalar elements
6700
6701 // For insert_vector_elt - either return the index matching scalar or recurse
6702 // into the base vector.
6703 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6704 isa<ConstantSDNode>(Op.getOperand(2))) {
6705 if (Op.getConstantOperandAPInt(2) == Index)
6706 return Op.getOperand(1);
6707 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6708 }
6709
6710 if (Opcode == ISD::SCALAR_TO_VECTOR)
6711 return (Index == 0) ? Op.getOperand(0)
6712 : DAG.getUNDEF(VT.getVectorElementType());
6713
6714 if (Opcode == ISD::BUILD_VECTOR)
6715 return Op.getOperand(Index);
6716
6717 return SDValue();
6718}
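// Worked example (illustrative): for BUILD_VECTOR(a, b, c, d), Index 2 returns
// the scalar c directly; for a two-input vector_shuffle the query follows the
// mask element into whichever source it references.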
6719
6720 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6721 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6722 const APInt &NonZeroMask,
6723 unsigned NumNonZero, unsigned NumZero,
6724 SelectionDAG &DAG,
6725 const X86Subtarget &Subtarget) {
6726 MVT VT = Op.getSimpleValueType();
6727 unsigned NumElts = VT.getVectorNumElements();
6728 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6729 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6730 "Illegal vector insertion");
6731
6732 SDValue V;
6733 bool First = true;
6734
6735 for (unsigned i = 0; i < NumElts; ++i) {
6736 bool IsNonZero = NonZeroMask[i];
6737 if (!IsNonZero)
6738 continue;
6739
6740 // If the build vector contains zeros or our first insertion is not the
6741 // first index, then insert into a zero vector to break any register
6742 // dependency; otherwise use SCALAR_TO_VECTOR.
6743 if (First) {
6744 First = false;
6745 if (NumZero || 0 != i)
6746 V = getZeroVector(VT, Subtarget, DAG, DL);
6747 else {
6748 assert(0 == i && "Expected insertion into zero-index");
6749 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6750 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6751 V = DAG.getBitcast(VT, V);
6752 continue;
6753 }
6754 }
6755 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6756 DAG.getVectorIdxConstant(i, DL));
6757 }
6758
6759 return V;
6760}
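// Worked example (illustrative): lowering the v8i16 build_vector
// (a, b, 0, 0, 0, 0, 0, 0) starts from a zero vector (NumZero != 0) and then
// emits two INSERT_VECTOR_ELTs, which select to PINSRW instructions.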
6761
6762 /// Custom lower build_vector of v16i8.
6763 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6764 const APInt &NonZeroMask,
6765 unsigned NumNonZero, unsigned NumZero,
6766 SelectionDAG &DAG,
6767 const X86Subtarget &Subtarget) {
6768 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6769 return SDValue();
6770
6771 // SSE4.1 - use PINSRB to insert each byte directly.
6772 if (Subtarget.hasSSE41())
6773 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6774 DAG, Subtarget);
6775
6776 SDValue V;
6777
6778 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6779 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6780 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6781 !NonZeroMask.extractBits(2, 2).isZero()) {
6782 for (unsigned I = 0; I != 4; ++I) {
6783 if (!NonZeroMask[I])
6784 continue;
6785 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6786 if (I != 0)
6787 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6788 DAG.getConstant(I * 8, DL, MVT::i8));
6789 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6790 }
6791 assert(V && "Failed to fold v16i8 vector to zero");
6792 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6793 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6794 V = DAG.getBitcast(MVT::v8i16, V);
6795 }
6796 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6797 bool ThisIsNonZero = NonZeroMask[i];
6798 bool NextIsNonZero = NonZeroMask[i + 1];
6799 if (!ThisIsNonZero && !NextIsNonZero)
6800 continue;
6801
6802 SDValue Elt;
6803 if (ThisIsNonZero) {
6804 if (NumZero || NextIsNonZero)
6805 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6806 else
6807 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6808 }
6809
6810 if (NextIsNonZero) {
6811 SDValue NextElt = Op.getOperand(i + 1);
6812 if (i == 0 && NumZero)
6813 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6814 else
6815 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6816 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6817 DAG.getConstant(8, DL, MVT::i8));
6818 if (ThisIsNonZero)
6819 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6820 else
6821 Elt = NextElt;
6822 }
6823
6824 // If our first insertion is not the first index or zeros are needed, then
6825 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6826 // elements undefined).
6827 if (!V) {
6828 if (i != 0 || NumZero)
6829 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6830 else {
6831 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6832 V = DAG.getBitcast(MVT::v8i16, V);
6833 continue;
6834 }
6835 }
6836 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6837 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6838 DAG.getVectorIdxConstant(i / 2, DL));
6839 }
6840
6841 return DAG.getBitcast(MVT::v16i8, V);
6842}
6843
6844 /// Custom lower build_vector of v8i16.
6845 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6846 const APInt &NonZeroMask,
6847 unsigned NumNonZero, unsigned NumZero,
6848 SelectionDAG &DAG,
6849 const X86Subtarget &Subtarget) {
6850 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6851 return SDValue();
6852
6853 // Use PINSRW to insert each byte directly.
6854 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6855 Subtarget);
6856}
6857
6858 /// Custom lower build_vector of v4i32 or v4f32.
6859 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6860 SelectionDAG &DAG,
6861 const X86Subtarget &Subtarget) {
6862 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6863 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6864 // Because we're creating a less complicated build vector here, we may enable
6865 // further folding of the MOVDDUP via shuffle transforms.
6866 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6867 Op.getOperand(0) == Op.getOperand(2) &&
6868 Op.getOperand(1) == Op.getOperand(3) &&
6869 Op.getOperand(0) != Op.getOperand(1)) {
6870 MVT VT = Op.getSimpleValueType();
6871 MVT EltVT = VT.getVectorElementType();
6872 // Create a new build vector with the first 2 elements followed by undef
6873 // padding, bitcast to v2f64, duplicate, and bitcast back.
6874 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6875 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6876 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6877 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6878 return DAG.getBitcast(VT, Dup);
6879 }
6880
6881 // Find all zeroable elements.
6882 std::bitset<4> Zeroable, Undefs;
6883 for (int i = 0; i < 4; ++i) {
6884 SDValue Elt = Op.getOperand(i);
6885 Undefs[i] = Elt.isUndef();
6886 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6887 }
6888 assert(Zeroable.size() - Zeroable.count() > 1 &&
6889 "We expect at least two non-zero elements!");
6890
6891 // We only know how to deal with build_vector nodes where elements are either
6892 // zeroable or extract_vector_elt with constant index.
6893 SDValue FirstNonZero;
6894 unsigned FirstNonZeroIdx;
6895 for (unsigned i = 0; i < 4; ++i) {
6896 if (Zeroable[i])
6897 continue;
6898 SDValue Elt = Op.getOperand(i);
6899 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6900 !isa<ConstantSDNode>(Elt.getOperand(1)))
6901 return SDValue();
6902 // Make sure that this node is extracting from a 128-bit vector.
6903 MVT VT = Elt.getOperand(0).getSimpleValueType();
6904 if (!VT.is128BitVector())
6905 return SDValue();
6906 if (!FirstNonZero.getNode()) {
6907 FirstNonZero = Elt;
6908 FirstNonZeroIdx = i;
6909 }
6910 }
6911
6912 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6913 SDValue V1 = FirstNonZero.getOperand(0);
6914 MVT VT = V1.getSimpleValueType();
6915
6916 // See if this build_vector can be lowered as a blend with zero.
6917 SDValue Elt;
6918 unsigned EltMaskIdx, EltIdx;
6919 int Mask[4];
6920 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6921 if (Zeroable[EltIdx]) {
6922 // The zero vector will be on the right hand side.
6923 Mask[EltIdx] = EltIdx+4;
6924 continue;
6925 }
6926
6927 Elt = Op->getOperand(EltIdx);
6928 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6929 EltMaskIdx = Elt.getConstantOperandVal(1);
6930 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6931 break;
6932 Mask[EltIdx] = EltIdx;
6933 }
6934
6935 if (EltIdx == 4) {
6936 // Let the shuffle legalizer deal with blend operations.
6937 SDValue VZeroOrUndef = (Zeroable == Undefs)
6938 ? DAG.getUNDEF(VT)
6939 : getZeroVector(VT, Subtarget, DAG, DL);
6940 if (V1.getSimpleValueType() != VT)
6941 V1 = DAG.getBitcast(VT, V1);
6942 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6943 }
6944
6945 // See if we can lower this build_vector to a INSERTPS.
6946 if (!Subtarget.hasSSE41())
6947 return SDValue();
6948
6949 SDValue V2 = Elt.getOperand(0);
6950 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6951 V1 = SDValue();
6952
6953 bool CanFold = true;
6954 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6955 if (Zeroable[i])
6956 continue;
6957
6958 SDValue Current = Op->getOperand(i);
6959 SDValue SrcVector = Current->getOperand(0);
6960 if (!V1.getNode())
6961 V1 = SrcVector;
6962 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6963 }
6964
6965 if (!CanFold)
6966 return SDValue();
6967
6968 assert(V1.getNode() && "Expected at least two non-zero elements!");
6969 if (V1.getSimpleValueType() != MVT::v4f32)
6970 V1 = DAG.getBitcast(MVT::v4f32, V1);
6971 if (V2.getSimpleValueType() != MVT::v4f32)
6972 V2 = DAG.getBitcast(MVT::v4f32, V2);
6973
6974 // Ok, we can emit an INSERTPS instruction.
6975 unsigned ZMask = Zeroable.to_ulong();
6976
6977 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6978 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6979 SDValue Result =
6980 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6981 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
6982 return DAG.getBitcast(VT, Result);
6983}
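// Note (illustrative): the InsertPSMask built above follows the INSERTPS
// immediate encoding - source element in bits [7:6], destination lane in bits
// [5:4], and the zeroed-lane mask in bits [3:0].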
6984
6985/// Return a vector logical shift node.
6986static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6987 SelectionDAG &DAG, const TargetLowering &TLI,
6988 const SDLoc &dl) {
6989 assert(VT.is128BitVector() && "Unknown type for VShift");
6990 MVT ShVT = MVT::v16i8;
6991 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6992 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6993 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6994 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6995 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6996}
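// Worked example (illustrative): getVShift(/*isLeft=*/true, MVT::v2i64, X,
// /*NumBits=*/64, ...) bitcasts X to v16i8, emits X86ISD::VSHLDQ with an
// 8-byte shift amount, and bitcasts the result back to v2i64.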
6997
6999 SelectionDAG &DAG) {
7000
7001 // Check if the scalar load can be widened into a vector load. And if
7002 // the address is "base + cst" see if the cst can be "absorbed" into
7003 // the shuffle mask.
7004 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7005 SDValue Ptr = LD->getBasePtr();
7006 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7007 return SDValue();
7008 EVT PVT = LD->getValueType(0);
7009 if (PVT != MVT::i32 && PVT != MVT::f32)
7010 return SDValue();
7011
7012 int FI = -1;
7013 int64_t Offset = 0;
7014 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7015 FI = FINode->getIndex();
7016 Offset = 0;
7017 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7018 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7019 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7020 Offset = Ptr.getConstantOperandVal(1);
7021 Ptr = Ptr.getOperand(0);
7022 } else {
7023 return SDValue();
7024 }
7025
7026 // FIXME: 256-bit vector instructions don't require a strict alignment,
7027 // improve this code to support it better.
7028 Align RequiredAlign(VT.getSizeInBits() / 8);
7029 SDValue Chain = LD->getChain();
7030 // Make sure the stack object alignment is at least 16 or 32.
7031 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7032 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7033 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7034 if (MFI.isFixedObjectIndex(FI)) {
7035 // Can't change the alignment. FIXME: It's possible to compute
7036 // the exact stack offset and reference FI + adjust offset instead;
7037 // if someone *really* cares about this, that's the way to implement it.
7038 return SDValue();
7039 } else {
7040 MFI.setObjectAlignment(FI, RequiredAlign);
7041 }
7042 }
7043
7044 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7045 // Ptr + (Offset & ~15).
7046 if (Offset < 0)
7047 return SDValue();
7048 if ((Offset % RequiredAlign.value()) & 3)
7049 return SDValue();
7050 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7051 if (StartOffset) {
7052 SDLoc DL(Ptr);
7053 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7054 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7055 }
7056
7057 int EltNo = (Offset - StartOffset) >> 2;
7058 unsigned NumElems = VT.getVectorNumElements();
7059
7060 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7061 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7062 LD->getPointerInfo().getWithOffset(StartOffset));
7063
7064 SmallVector<int, 8> Mask(NumElems, EltNo);
7065
7066 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7067 }
7068
7069 return SDValue();
7070}
7071
7072 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7073static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7074 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7075 auto *BaseLd = cast<LoadSDNode>(Elt);
7076 if (!BaseLd->isSimple())
7077 return false;
7078 Ld = BaseLd;
7079 ByteOffset = 0;
7080 return true;
7081 }
7082
7083 switch (Elt.getOpcode()) {
7084 case ISD::BITCAST:
7085 case ISD::TRUNCATE:
7086 case ISD::SCALAR_TO_VECTOR:
7087 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7088 case ISD::SRL:
7089 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7090 uint64_t Amt = AmtC->getZExtValue();
7091 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7092 ByteOffset += Amt / 8;
7093 return true;
7094 }
7095 }
7096 break;
7097 case ISD::EXTRACT_VECTOR_ELT:
7098 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7099 SDValue Src = Elt.getOperand(0);
7100 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7101 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7102 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7103 findEltLoadSrc(Src, Ld, ByteOffset)) {
7104 uint64_t Idx = IdxC->getZExtValue();
7105 ByteOffset += Idx * (SrcSizeInBits / 8);
7106 return true;
7107 }
7108 }
7109 break;
7110 }
7111
7112 return false;
7113}
7114
7115/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7116/// elements can be replaced by a single large load which has the same value as
7117/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7118///
7119 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7120 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7121 const SDLoc &DL, SelectionDAG &DAG,
7122 const X86Subtarget &Subtarget,
7123 bool IsAfterLegalize) {
7124 if ((VT.getScalarSizeInBits() % 8) != 0)
7125 return SDValue();
7126
7127 unsigned NumElems = Elts.size();
7128
7129 int LastLoadedElt = -1;
7130 APInt LoadMask = APInt::getZero(NumElems);
7131 APInt ZeroMask = APInt::getZero(NumElems);
7132 APInt UndefMask = APInt::getZero(NumElems);
7133
7134 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7135 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7136
7137 // For each element in the initializer, see if we've found a load, zero or an
7138 // undef.
7139 for (unsigned i = 0; i < NumElems; ++i) {
7140 SDValue Elt = peekThroughBitcasts(Elts[i]);
7141 if (!Elt.getNode())
7142 return SDValue();
7143 if (Elt.isUndef()) {
7144 UndefMask.setBit(i);
7145 continue;
7146 }
7147 if (X86::isZeroNode(Elt)) {
7148 ZeroMask.setBit(i);
7149 continue;
7150 }
7151
7152 // Each loaded element must be the correct fractional portion of the
7153 // requested vector load.
7154 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7155 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7156 return SDValue();
7157
7158 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7159 return SDValue();
7160 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7161 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7162 return SDValue();
7163
7164 LoadMask.setBit(i);
7165 LastLoadedElt = i;
7166 }
7167 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7168 NumElems &&
7169 "Incomplete element masks");
7170
7171 // Handle Special Cases - all undef or undef/zero.
7172 if (UndefMask.popcount() == NumElems)
7173 return DAG.getUNDEF(VT);
7174 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7175 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7176 : DAG.getConstantFP(0.0, DL, VT);
7177
7178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7179 int FirstLoadedElt = LoadMask.countr_zero();
7180 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7181 EVT EltBaseVT = EltBase.getValueType();
7182 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7183 "Register/Memory size mismatch");
7184 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7185 assert(LDBase && "Did not find base load for merging consecutive loads");
7186 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7187 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7188 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7189 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7190 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7191
7192 // TODO: Support offsetting the base load.
7193 if (ByteOffsets[FirstLoadedElt] != 0)
7194 return SDValue();
7195
7196 // Check to see if the element's load is consecutive to the base load
7197 // or offset from a previous (already checked) load.
7198 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7199 LoadSDNode *Ld = Loads[EltIdx];
7200 int64_t ByteOffset = ByteOffsets[EltIdx];
7201 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7202 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7203 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7204 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7205 }
7206 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7207 EltIdx - FirstLoadedElt);
7208 };
7209
7210 // Consecutive loads can contain UNDEFs but not ZERO elements.
7211 // Consecutive loads with UNDEF and ZERO elements require
7212 // an additional shuffle stage to clear the ZERO elements.
7213 bool IsConsecutiveLoad = true;
7214 bool IsConsecutiveLoadWithZeros = true;
7215 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7216 if (LoadMask[i]) {
7217 if (!CheckConsecutiveLoad(LDBase, i)) {
7218 IsConsecutiveLoad = false;
7219 IsConsecutiveLoadWithZeros = false;
7220 break;
7221 }
7222 } else if (ZeroMask[i]) {
7223 IsConsecutiveLoad = false;
7224 }
7225 }
7226
7227 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7228 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7229 assert(LDBase->isSimple() &&
7230 "Cannot merge volatile or atomic loads.");
7231 SDValue NewLd =
7232 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7233 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7234 MMOFlags);
7235 for (auto *LD : Loads)
7236 if (LD)
7237 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7238 return NewLd;
7239 };
7240
7241 // Check if the base load is entirely dereferenceable.
7242 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7243 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7244
7245 // LOAD - all consecutive load/undefs (must start/end with a load or be
7246 // entirely dereferenceable). If we have found an entire vector of loads and
7247 // undefs, then return a large load of the entire vector width starting at the
7248 // base pointer. If the vector contains zeros, then attempt to shuffle those
7249 // elements.
7250 if (FirstLoadedElt == 0 &&
7251 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7252 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7253 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7254 return SDValue();
7255
7256 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7257 // will lower to regular temporal loads and use the cache.
7258 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7259 VT.is256BitVector() && !Subtarget.hasInt256())
7260 return SDValue();
7261
7262 if (NumElems == 1)
7263 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7264
7265 if (!ZeroMask)
7266 return CreateLoad(VT, LDBase);
7267
7268 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7269 // vector and a zero vector to clear out the zero elements.
7270 if (!IsAfterLegalize && VT.isVector()) {
7271 unsigned NumMaskElts = VT.getVectorNumElements();
7272 if ((NumMaskElts % NumElems) == 0) {
7273 unsigned Scale = NumMaskElts / NumElems;
7274 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7275 for (unsigned i = 0; i < NumElems; ++i) {
7276 if (UndefMask[i])
7277 continue;
7278 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7279 for (unsigned j = 0; j != Scale; ++j)
7280 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7281 }
7282 SDValue V = CreateLoad(VT, LDBase);
7283 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7284 : DAG.getConstantFP(0.0, DL, VT);
7285 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7286 }
7287 }
7288 }
7289
7290 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7291 if (VT.is256BitVector() || VT.is512BitVector()) {
7292 unsigned HalfNumElems = NumElems / 2;
7293 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7294 EVT HalfVT =
7295 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7296 SDValue HalfLD =
7297 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7298 DAG, Subtarget, IsAfterLegalize);
7299 if (HalfLD)
7300 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7301 HalfLD, DAG.getVectorIdxConstant(0, DL));
7302 }
7303 }
7304
7305 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7306 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7307 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7308 LoadSizeInBits == 64) &&
7309 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7310 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7311 : MVT::getIntegerVT(LoadSizeInBits);
7312 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7313 // Allow v4f32 on SSE1 only targets.
7314 // FIXME: Add more isel patterns so we can just use VT directly.
7315 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7316 VecVT = MVT::v4f32;
7317 if (TLI.isTypeLegal(VecVT)) {
7318 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7319 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7320 SDValue ResNode = DAG.getMemIntrinsicNode(
7321 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7322 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7323 for (auto *LD : Loads)
7324 if (LD)
7325 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7326 return DAG.getBitcast(VT, ResNode);
7327 }
7328 }
7329
7330 // BROADCAST - match the smallest possible repetition pattern, load that
7331 // scalar/subvector element and then broadcast to the entire vector.
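// For illustration (hypothetical pointer p): a v8i32 build_vector loading
// p+0, p+4, p+0, p+4, ... repeats every 64 bits, so an i64 loaded from p can
// be broadcast to fill the whole vector.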
7332 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7333 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7334 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7335 unsigned RepeatSize = SubElems * BaseSizeInBits;
7336 unsigned ScalarSize = std::min(RepeatSize, 64u);
7337 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7338 continue;
7339
7340 // Don't attempt a 1:N subvector broadcast - it should be caught by
7341 // combineConcatVectorOps, else it will cause infinite loops.
7342 if (RepeatSize > ScalarSize && SubElems == 1)
7343 continue;
7344
7345 bool Match = true;
7346 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7347 for (unsigned i = 0; i != NumElems && Match; ++i) {
7348 if (!LoadMask[i])
7349 continue;
7350 SDValue Elt = peekThroughBitcasts(Elts[i]);
7351 if (RepeatedLoads[i % SubElems].isUndef())
7352 RepeatedLoads[i % SubElems] = Elt;
7353 else
7354 Match &= (RepeatedLoads[i % SubElems] == Elt);
7355 }
7356
7357 // We must have loads at both ends of the repetition.
7358 Match &= !RepeatedLoads.front().isUndef();
7359 Match &= !RepeatedLoads.back().isUndef();
7360 if (!Match)
7361 continue;
7362
7363 EVT RepeatVT =
7364 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7365 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7366 : EVT::getFloatingPointVT(ScalarSize);
7367 if (RepeatSize > ScalarSize)
7368 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7369 RepeatSize / ScalarSize);
7370 EVT BroadcastVT =
7371 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7372 VT.getSizeInBits() / ScalarSize);
7373 if (TLI.isTypeLegal(BroadcastVT)) {
7374 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7375 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7376 SDValue Broadcast = RepeatLoad;
7377 if (RepeatSize > ScalarSize) {
7378 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7379 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7380 } else {
7381 if (!Subtarget.hasAVX2() &&
7382 !X86::mayFoldLoadIntoBroadcastFromMem(
7383 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7384 Subtarget,
7385 /*AssumeSingleUse=*/true))
7386 return SDValue();
7387 Broadcast =
7388 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7389 }
7390 return DAG.getBitcast(VT, Broadcast);
7391 }
7392 }
7393 }
7394 }
7395
7396 return SDValue();
7397}
7398
7399 // Combine vector ops (shuffles etc.) that are equivalent to build_vector load1,
7400 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7401 // are consecutive, non-overlapping, and in the right order.
7402 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7403 SelectionDAG &DAG,
7404 const X86Subtarget &Subtarget,
7405 bool IsAfterLegalize) {
7406 SmallVector<SDValue, 64> Elts;
7407 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7408 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7409 Elts.push_back(Elt);
7410 continue;
7411 }
7412 return SDValue();
7413 }
7414 assert(Elts.size() == VT.getVectorNumElements());
7415 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7416 IsAfterLegalize);
7417}
7418
7419 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7420 const APInt &Undefs, LLVMContext &C) {
7421 unsigned ScalarSize = VT.getScalarSizeInBits();
7422 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7423
7424 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7425 if (VT.isFloatingPoint()) {
7426 if (ScalarSize == 16)
7427 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7428 if (ScalarSize == 32)
7429 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7430 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7431 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7432 }
7433 return Constant::getIntegerValue(Ty, Val);
7434 };
7435
7436 SmallVector<Constant *, 32> ConstantVec;
7437 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7438 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7439 : getConstantScalar(Bits[I]));
7440
7441 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7442}
7443
7444static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7445 unsigned SplatBitSize, LLVMContext &C) {
7446 unsigned ScalarSize = VT.getScalarSizeInBits();
7447
7448 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7449 if (VT.isFloatingPoint()) {
7450 if (ScalarSize == 16)
7451 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7452 if (ScalarSize == 32)
7453 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7454 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7455 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7456 }
7457 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7458 };
7459
7460 if (ScalarSize == SplatBitSize)
7461 return getConstantScalar(SplatValue);
7462
7463 unsigned NumElm = SplatBitSize / ScalarSize;
7464 SmallVector<Constant *, 32> ConstantVec;
7465 for (unsigned I = 0; I != NumElm; ++I) {
7466 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7467 ConstantVec.push_back(getConstantScalar(Val));
7468 }
7469 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7470}
7471
7472 static bool isFoldableUseOfShuffle(SDNode *N) {
7473 for (auto *U : N->users()) {
7474 unsigned Opc = U->getOpcode();
7475 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7476 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7477 return false;
7478 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7479 return false;
7480 if (isTargetShuffle(Opc))
7481 return true;
7482 if (Opc == ISD::BITCAST) // Ignore bitcasts
7483 return isFoldableUseOfShuffle(U);
7484 if (N->hasOneUse()) {
7485 // TODO: There may be some general way to know if an SDNode can
7486 // be folded. We currently only know whether an MI is foldable.
7487 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7488 return false;
7489 return true;
7490 }
7491 }
7492 return false;
7493}
7494
7495/// Attempt to use the vbroadcast instruction to generate a splat value
7496/// from a splat BUILD_VECTOR which uses:
7497/// a. A single scalar load, or a constant.
7498/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7499///
7500/// The VBROADCAST node is returned when a pattern is found,
7501/// or SDValue() otherwise.
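/// For illustration: a splat BUILD_VECTOR whose single repeated operand is a
/// scalar load can typically be lowered to one X86ISD::VBROADCAST_LOAD node.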
7502 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7503 const SDLoc &dl,
7504 const X86Subtarget &Subtarget,
7505 SelectionDAG &DAG) {
7506 // VBROADCAST requires AVX.
7507 // TODO: Splats could be generated for non-AVX CPUs using SSE
7508 // instructions, but there's less potential gain for only 128-bit vectors.
7509 if (!Subtarget.hasAVX())
7510 return SDValue();
7511
7512 MVT VT = BVOp->getSimpleValueType(0);
7513 unsigned NumElts = VT.getVectorNumElements();
7514 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7515 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7516 "Unsupported vector type for broadcast.");
7517
7518 // See if the build vector is a repeating sequence of scalars (inc. splat).
7519 SDValue Ld;
7520 BitVector UndefElements;
7521 SmallVector<SDValue, 16> Sequence;
7522 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7523 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7524 if (Sequence.size() == 1)
7525 Ld = Sequence[0];
7526 }
7527
7528 // Attempt to use VBROADCASTM
7529 // From this pattern:
7530 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7531 // b. t1 = (build_vector t0 t0)
7532 //
7533 // Create (VBROADCASTM v2i1 X)
7534 if (!Sequence.empty() && Subtarget.hasCDI()) {
7535 // If not a splat, are the upper sequence values zeroable?
7536 unsigned SeqLen = Sequence.size();
7537 bool UpperZeroOrUndef =
7538 SeqLen == 1 ||
7539 llvm::all_of(ArrayRef(Sequence).drop_front(),
7540 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7541 SDValue Op0 = Sequence[0];
7542 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7543 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7544 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7545 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7546 ? Op0.getOperand(0)
7547 : Op0.getOperand(0).getOperand(0);
7548 MVT MaskVT = BOperand.getSimpleValueType();
7549 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7550 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7551 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7552 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7553 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7554 unsigned Scale = 512 / VT.getSizeInBits();
7555 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7556 }
7557 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7558 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7559 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7560 return DAG.getBitcast(VT, Bcst);
7561 }
7562 }
7563 }
7564
7565 unsigned NumUndefElts = UndefElements.count();
7566 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7567 APInt SplatValue, Undef;
7568 unsigned SplatBitSize;
7569 bool HasUndef;
7570 // Check if this is a repeated constant pattern suitable for broadcasting.
7571 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7572 SplatBitSize > VT.getScalarSizeInBits() &&
7573 SplatBitSize < VT.getSizeInBits()) {
7574 // Avoid replacing with broadcast when it's a use of a shuffle
7575 // instruction to preserve the present custom lowering of shuffles.
7576 if (isFoldableUseOfShuffle(BVOp))
7577 return SDValue();
7578 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
7579 LLVMContext *Ctx = DAG.getContext();
7580 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7581 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7582 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7583 // Load the constant scalar/subvector and broadcast it.
7584 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7585 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7586 SDValue CP = DAG.getConstantPool(C, PVT);
7587 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7588
7589 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7590 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7591 SDValue Ops[] = {DAG.getEntryNode(), CP};
7592 MachinePointerInfo MPI =
7593 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7594 SDValue Brdcst =
7595 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7596 MPI, Alignment, MachineMemOperand::MOLoad);
7597 return DAG.getBitcast(VT, Brdcst);
7598 }
7599 if (SplatBitSize > 64) {
7600 // Load the vector of constants and broadcast it.
7601 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7602 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7603 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7604 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7605 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7606 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7607 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7608 MachinePointerInfo MPI =
7609 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7610 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7611 Ops, VVT, MPI, Alignment,
7612 MachineMemOperand::MOLoad);
7613 }
7614 }
7615
7616 // If we are moving a scalar into a vector (Ld must be set and all elements
7617 // but 1 are undef) and that operation is not obviously supported by
7618 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7619 // That's better than general shuffling and may eliminate a load to GPR and
7620 // move from scalar to vector register.
7621 if (!Ld || NumElts - NumUndefElts != 1)
7622 return SDValue();
7623 unsigned ScalarSize = Ld.getValueSizeInBits();
7624 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7625 return SDValue();
7626 }
7627
7628 bool ConstSplatVal =
7629 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7630 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7631
7632 // TODO: Handle broadcasts of non-constant sequences.
7633
7634 // Make sure that all of the users of a non-constant load are from the
7635 // BUILD_VECTOR node.
7636 // FIXME: Is the use count needed for non-constant, non-load case?
7637 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7638 return SDValue();
7639
7640 unsigned ScalarSize = Ld.getValueSizeInBits();
7641 bool IsGE256 = (VT.getSizeInBits() >= 256);
7642
7643 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7644 // instruction to save 8 or more bytes of constant pool data.
7645 // TODO: If multiple splats are generated to load the same constant,
7646 // it may be detrimental to overall size. There needs to be a way to detect
7647 // that condition to know if this is truly a size win.
7648 bool OptForSize = DAG.shouldOptForSize();
7649
7650 // Handle broadcasting a single constant scalar from the constant pool
7651 // into a vector.
7652 // On Sandybridge (no AVX2), it is still better to load a constant vector
7653 // from the constant pool and not to broadcast it from a scalar.
7654 // But override that restriction when optimizing for size.
7655 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7656 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7657 EVT CVT = Ld.getValueType();
7658 assert(!CVT.isVector() && "Must not broadcast a vector type");
7659
7660 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7661 // For size optimization, also splat v2f64 and v2i64, and for size opt
7662 // with AVX2, also splat i8 and i16.
7663 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7664 if (ScalarSize == 32 ||
7665 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7666 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7667 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7668 const Constant *C = nullptr;
7669 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7670 C = CI->getConstantIntValue();
7671 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7672 C = CF->getConstantFPValue();
7673
7674 assert(C && "Invalid constant type");
7675
7676 SDValue CP =
7677 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7678 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7679
7680 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7681 SDValue Ops[] = {DAG.getEntryNode(), CP};
7682 MachinePointerInfo MPI =
7683 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7684 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7685 MPI, Alignment, MachineMemOperand::MOLoad);
7686 }
7687 }
7688
7689 // Handle AVX2 in-register broadcasts.
7690 if (!IsLoad && Subtarget.hasInt256() &&
7691 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7692 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7693
7694 // The scalar source must be a normal load.
7695 if (!IsLoad)
7696 return SDValue();
7697
7698 // Make sure the non-chain result is only used by this build vector.
7699 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7700 return SDValue();
7701
7702 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7703 (Subtarget.hasVLX() && ScalarSize == 64)) {
7704 auto *LN = cast<LoadSDNode>(Ld);
7705 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7706 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7707 SDValue BCast =
7708 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7709 LN->getMemoryVT(), LN->getMemOperand());
7710 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7711 return BCast;
7712 }
7713
7714 // The integer check is needed for the 64-bit element in a 128-bit vector case,
7715 // so that it doesn't match double, since there is no vbroadcastsd xmm.
7716 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7717 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7718 auto *LN = cast<LoadSDNode>(Ld);
7719 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7720 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7721 SDValue BCast =
7722 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7723 LN->getMemoryVT(), LN->getMemOperand());
7724 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7725 return BCast;
7726 }
7727
7728 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7729 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7730
7731 // Unsupported broadcast.
7732 return SDValue();
7733}
7734
7735/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7736/// underlying vector and index.
7737///
7738/// Modifies \p ExtractedFromVec to the real vector and returns the real
7739/// index.
7740static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7741 SDValue ExtIdx) {
7742 int Idx = ExtIdx->getAsZExtVal();
7743 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7744 return Idx;
7745
7746 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7747 // lowered this:
7748 // (extract_vector_elt (v8f32 %1), Constant<6>)
7749 // to:
7750 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7751 // (extract_subvector (v8f32 %0), Constant<4>),
7752 // undef)
7753 // Constant<0>)
7754 // In this case the vector is the extract_subvector expression and the index
7755 // is 2, as specified by the shuffle.
7756 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7757 SDValue ShuffleVec = SVOp->getOperand(0);
7758 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7759 assert(ShuffleVecVT.getVectorElementType() ==
7760 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7761
7762 int ShuffleIdx = SVOp->getMaskElt(Idx);
7763 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7764 ExtractedFromVec = ShuffleVec;
7765 return ShuffleIdx;
7766 }
7767 return Idx;
7768}
7769
7770 static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7771 SelectionDAG &DAG) {
7772 MVT VT = Op.getSimpleValueType();
7773
7774 // Skip if insert_vec_elt is not supported.
7775 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7776 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7777 return SDValue();
7778
7779 unsigned NumElems = Op.getNumOperands();
7780 SDValue VecIn1;
7781 SDValue VecIn2;
7782 SmallVector<unsigned, 4> InsertIndices;
7783 SmallVector<int, 8> Mask(NumElems, -1);
7784
7785 for (unsigned i = 0; i != NumElems; ++i) {
7786 unsigned Opc = Op.getOperand(i).getOpcode();
7787
7788 if (Opc == ISD::UNDEF)
7789 continue;
7790
7791 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7792 // Quit if more than 1 element needs inserting.
7793 if (InsertIndices.size() > 1)
7794 return SDValue();
7795
7796 InsertIndices.push_back(i);
7797 continue;
7798 }
7799
7800 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7801 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7802
7803 // Quit if non-constant index.
7804 if (!isa<ConstantSDNode>(ExtIdx))
7805 return SDValue();
7806 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7807
7808 // Quit if extracted from vector of different type.
7809 if (ExtractedFromVec.getValueType() != VT)
7810 return SDValue();
7811
7812 if (!VecIn1.getNode())
7813 VecIn1 = ExtractedFromVec;
7814 else if (VecIn1 != ExtractedFromVec) {
7815 if (!VecIn2.getNode())
7816 VecIn2 = ExtractedFromVec;
7817 else if (VecIn2 != ExtractedFromVec)
7818 // Quit if more than 2 vectors to shuffle
7819 return SDValue();
7820 }
7821
7822 if (ExtractedFromVec == VecIn1)
7823 Mask[i] = Idx;
7824 else if (ExtractedFromVec == VecIn2)
7825 Mask[i] = Idx + NumElems;
7826 }
7827
7828 if (!VecIn1.getNode())
7829 return SDValue();
7830
7831 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7832 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7833
7834 for (unsigned Idx : InsertIndices)
7835 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7836 DAG.getVectorIdxConstant(Idx, DL));
7837
7838 return NV;
7839}
7840
7841// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7842 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7843 const X86Subtarget &Subtarget) {
7844 MVT VT = Op.getSimpleValueType();
7845 MVT IVT =
7846 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7847 SmallVector<SDValue, 32> NewOps;
7848 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7849 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7850 Op.getOperand(I)));
7851 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7852 return DAG.getBitcast(VT, Res);
7853}
7854
7855// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7856 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7857 SelectionDAG &DAG,
7858 const X86Subtarget &Subtarget) {
7859
7860 MVT VT = Op.getSimpleValueType();
7861 assert((VT.getVectorElementType() == MVT::i1) &&
7862 "Unexpected type in LowerBUILD_VECTORvXi1!");
7863 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7864 ISD::isBuildVectorAllOnes(Op.getNode()))
7865 return Op;
7866
7867 uint64_t Immediate = 0;
7868 SmallVector<unsigned, 16> NonConstIdx;
7869 bool IsSplat = true;
7870 bool HasConstElts = false;
7871 int SplatIdx = -1;
7872 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7873 SDValue In = Op.getOperand(idx);
7874 if (In.isUndef())
7875 continue;
7876 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7877 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7878 HasConstElts = true;
7879 } else {
7880 NonConstIdx.push_back(idx);
7881 }
7882 if (SplatIdx < 0)
7883 SplatIdx = idx;
7884 else if (In != Op.getOperand(SplatIdx))
7885 IsSplat = false;
7886 }
7887
7888 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7889 if (IsSplat) {
7890 // The build_vector allows the scalar element to be larger than the vector
7891 // element type. We need to mask it to use as a condition unless we know
7892 // the upper bits are zero.
7893 // FIXME: Use computeKnownBits instead of checking specific opcode?
7894 SDValue Cond = Op.getOperand(SplatIdx);
7895 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7896 if (Cond.getOpcode() != ISD::SETCC)
7897 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7898 DAG.getConstant(1, dl, MVT::i8));
7899
7900 // Perform the select in the scalar domain so we can use cmov.
7901 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7902 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7903 DAG.getAllOnesConstant(dl, MVT::i32),
7904 DAG.getConstant(0, dl, MVT::i32));
7905 Select = DAG.getBitcast(MVT::v32i1, Select);
7906 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7907 } else {
7908 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7909 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7910 DAG.getAllOnesConstant(dl, ImmVT),
7911 DAG.getConstant(0, dl, ImmVT));
7912 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7913 Select = DAG.getBitcast(VecVT, Select);
7914 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7915 DAG.getVectorIdxConstant(0, dl));
7916 }
7917 }
7918
7919 // Insert the remaining non-constant elements one by one.
7920 SDValue DstVec;
7921 if (HasConstElts) {
7922 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7923 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7924 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7925 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7926 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7927 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7928 } else {
7929 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7930 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7931 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7932 DstVec = DAG.getBitcast(VecVT, Imm);
7933 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7934 DAG.getVectorIdxConstant(0, dl));
7935 }
7936 } else
7937 DstVec = DAG.getUNDEF(VT);
7938
7939 for (unsigned InsertIdx : NonConstIdx) {
7940 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7941 Op.getOperand(InsertIdx),
7942 DAG.getVectorIdxConstant(InsertIdx, dl));
7943 }
7944 return DstVec;
7945}
7946
7947LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7948 switch (Opcode) {
7949 case X86ISD::PACKSS:
7950 case X86ISD::PACKUS:
7951 case X86ISD::FHADD:
7952 case X86ISD::FHSUB:
7953 case X86ISD::HADD:
7954 case X86ISD::HSUB:
7955 return true;
7956 }
7957 return false;
7958}
7959
7960/// This is a helper function of LowerToHorizontalOp().
7961 /// This function checks whether the input build_vector \p N implements a
7962/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7963/// may not match the layout of an x86 256-bit horizontal instruction.
7964/// In other words, if this returns true, then some extraction/insertion will
7965/// be required to produce a valid horizontal instruction.
7966///
7967/// Parameter \p Opcode defines the kind of horizontal operation to match.
7968/// For example, if \p Opcode is equal to ISD::ADD, then this function
7969/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7970/// is equal to ISD::SUB, then this function checks if this is a horizontal
7971/// arithmetic sub.
7972///
7973/// This function only analyzes elements of \p N whose indices are
7974/// in range [BaseIdx, LastIdx).
7975///
7976/// TODO: This function was originally used to match both real and fake partial
7977/// horizontal operations, but the index-matching logic is incorrect for that.
7978/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7979/// code because it is only used for partial h-op matching now?
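/// For illustration (hypothetical vectors A and B), a call over elements
/// [0, 4) of a v8i32 build_vector matches \p Opcode == ISD::ADD when those
/// elements are:
///   (add (extractelt A, 0), (extractelt A, 1)),
///   (add (extractelt A, 2), (extractelt A, 3)),
///   (add (extractelt B, 0), (extractelt B, 1)),
///   (add (extractelt B, 2), (extractelt B, 3))
/// in which case \p V0 is set to A and \p V1 to B.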
7980static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7981 const SDLoc &DL, SelectionDAG &DAG,
7982 unsigned BaseIdx, unsigned LastIdx,
7983 SDValue &V0, SDValue &V1) {
7984 EVT VT = N->getValueType(0);
7985 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7986 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7987 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7988 "Invalid Vector in input!");
7989
7990 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7991 bool CanFold = true;
7992 unsigned ExpectedVExtractIdx = BaseIdx;
7993 unsigned NumElts = LastIdx - BaseIdx;
7994 V0 = DAG.getUNDEF(VT);
7995 V1 = DAG.getUNDEF(VT);
7996
7997 // Check if N implements a horizontal binop.
7998 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7999 SDValue Op = N->getOperand(i + BaseIdx);
8000
8001 // Skip UNDEFs.
8002 if (Op->isUndef()) {
8003 // Update the expected vector extract index.
8004 if (i * 2 == NumElts)
8005 ExpectedVExtractIdx = BaseIdx;
8006 ExpectedVExtractIdx += 2;
8007 continue;
8008 }
8009
8010 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8011
8012 if (!CanFold)
8013 break;
8014
8015 SDValue Op0 = Op.getOperand(0);
8016 SDValue Op1 = Op.getOperand(1);
8017
8018 // Try to match the following pattern:
8019 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8020 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8021 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8022 Op0.getOperand(0) == Op1.getOperand(0) &&
8023 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8024 isa<ConstantSDNode>(Op1.getOperand(1)));
8025 if (!CanFold)
8026 break;
8027
8028 unsigned I0 = Op0.getConstantOperandVal(1);
8029 unsigned I1 = Op1.getConstantOperandVal(1);
8030
8031 if (i * 2 < NumElts) {
8032 if (V0.isUndef()) {
8033 V0 = Op0.getOperand(0);
8034 if (V0.getValueType() != VT)
8035 return false;
8036 }
8037 } else {
8038 if (V1.isUndef()) {
8039 V1 = Op0.getOperand(0);
8040 if (V1.getValueType() != VT)
8041 return false;
8042 }
8043 if (i * 2 == NumElts)
8044 ExpectedVExtractIdx = BaseIdx;
8045 }
8046
8047 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8048 if (I0 == ExpectedVExtractIdx)
8049 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8050 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8051 // Try to match the following dag sequence:
8052 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8053 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8054 } else
8055 CanFold = false;
8056
8057 ExpectedVExtractIdx += 2;
8058 }
8059
8060 return CanFold;
8061}
8062
8063/// Emit a sequence of two 128-bit horizontal add/sub followed by
8064/// a concat_vector.
8065///
8066/// This is a helper function of LowerToHorizontalOp().
8067/// This function expects two 256-bit vectors called V0 and V1.
8068/// At first, each vector is split into two separate 128-bit vectors.
8069/// Then, the resulting 128-bit vectors are used to implement two
8070/// horizontal binary operations.
8071///
8072/// The kind of horizontal binary operation is defined by \p X86Opcode.
8073///
8074 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
8075 /// the two new horizontal binops.
8076 /// When Mode is set, the first horizontal binop dag node takes as input
8077 /// the lower 128 bits of V0 and the upper 128 bits of V0. The second
8078 /// horizontal binop dag node takes as input the lower 128 bits of V1
8079 /// and the upper 128 bits of V1.
8080/// Example:
8081/// HADD V0_LO, V0_HI
8082/// HADD V1_LO, V1_HI
8083///
8084 /// Otherwise, the first horizontal binop dag node takes as input the lower
8085 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal binop
8086 /// dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
8087/// Example:
8088/// HADD V0_LO, V1_LO
8089/// HADD V0_HI, V1_HI
8090///
8091/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8092/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8093/// the upper 128-bits of the result.
8094static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8095 const SDLoc &DL, SelectionDAG &DAG,
8096 unsigned X86Opcode, bool Mode,
8097 bool isUndefLO, bool isUndefHI) {
8098 MVT VT = V0.getSimpleValueType();
8099 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8100 "Invalid nodes in input!");
8101
8102 unsigned NumElts = VT.getVectorNumElements();
8103 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8104 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8105 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8106 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8107 MVT NewVT = V0_LO.getSimpleValueType();
8108
8109 SDValue LO = DAG.getUNDEF(NewVT);
8110 SDValue HI = DAG.getUNDEF(NewVT);
8111
8112 if (Mode) {
8113 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8114 if (!isUndefLO && !V0->isUndef())
8115 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8116 if (!isUndefHI && !V1->isUndef())
8117 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8118 } else {
8119 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8120 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8121 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8122
8123 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8124 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8125 }
8126
8127 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8128}
8129
8130/// Returns true iff \p BV builds a vector with the result equivalent to
8131 /// the result of an ADDSUB/SUBADD operation.
8132/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8133/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8134/// \p Opnd0 and \p Opnd1.
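/// For illustration (hypothetical vectors A and B): a v4f32 build_vector of
///   (fsub (extractelt A, 0), (extractelt B, 0)),
///   (fadd (extractelt A, 1), (extractelt B, 1)),
///   (fsub (extractelt A, 2), (extractelt B, 2)),
///   (fadd (extractelt A, 3), (extractelt B, 3))
/// is an ADDSUB with \p Opnd0 == A and \p Opnd1 == B.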
8135 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8136 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8137 SDValue &Opnd0, SDValue &Opnd1,
8138 unsigned &NumExtracts,
8139 bool &IsSubAdd) {
8140
8141 MVT VT = BV->getSimpleValueType(0);
8142 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8143 return false;
8144
8145 unsigned NumElts = VT.getVectorNumElements();
8146 SDValue InVec0 = DAG.getUNDEF(VT);
8147 SDValue InVec1 = DAG.getUNDEF(VT);
8148
8149 NumExtracts = 0;
8150
8151 // Odd-numbered elements in the input build vector are obtained from
8152 // adding/subtracting two integer/float elements.
8153 // Even-numbered elements in the input build vector are obtained from
8154 // subtracting/adding two integer/float elements.
8155 unsigned Opc[2] = {0, 0};
8156 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8157 SDValue Op = BV->getOperand(i);
8158
8159 // Skip 'undef' values.
8160 unsigned Opcode = Op.getOpcode();
8161 if (Opcode == ISD::UNDEF)
8162 continue;
8163
8164 // Early exit if we found an unexpected opcode.
8165 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8166 return false;
8167
8168 SDValue Op0 = Op.getOperand(0);
8169 SDValue Op1 = Op.getOperand(1);
8170
8171 // Try to match the following pattern:
8172 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8173 // Early exit if we cannot match that sequence.
8174 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8175 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8176 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8177 Op0.getOperand(1) != Op1.getOperand(1))
8178 return false;
8179
8180 unsigned I0 = Op0.getConstantOperandVal(1);
8181 if (I0 != i)
8182 return false;
8183
8184 // We found a valid add/sub node; make sure it's the same opcode as previous
8185 // elements for this parity.
8186 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8187 return false;
8188 Opc[i % 2] = Opcode;
8189
8190 // Update InVec0 and InVec1.
8191 if (InVec0.isUndef()) {
8192 InVec0 = Op0.getOperand(0);
8193 if (InVec0.getSimpleValueType() != VT)
8194 return false;
8195 }
8196 if (InVec1.isUndef()) {
8197 InVec1 = Op1.getOperand(0);
8198 if (InVec1.getSimpleValueType() != VT)
8199 return false;
8200 }
8201
8202 // Make sure that the input operands to each add/sub node always
8203 // come from the same pair of vectors.
8204 if (InVec0 != Op0.getOperand(0)) {
8205 if (Opcode == ISD::FSUB)
8206 return false;
8207
8208 // FADD is commutable. Try to commute the operands
8209 // and then test again.
8210 std::swap(Op0, Op1);
8211 if (InVec0 != Op0.getOperand(0))
8212 return false;
8213 }
8214
8215 if (InVec1 != Op1.getOperand(0))
8216 return false;
8217
8218 // Increment the number of extractions done.
8219 ++NumExtracts;
8220 }
8221
8222 // Ensure we have found an opcode for both parities and that they are
8223 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8224 // inputs are undef.
8225 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8226 InVec0.isUndef() || InVec1.isUndef())
8227 return false;
8228
8229 IsSubAdd = Opc[0] == ISD::FADD;
8230
8231 Opnd0 = InVec0;
8232 Opnd1 = InVec1;
8233 return true;
8234}
8235
8236 /// Returns true if it is possible to fold a MUL and an idiom that has already been
8237/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8238/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8239/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8240///
8241/// Prior to calling this function it should be known that there is some
8242/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8243/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8244/// before replacement of such SDNode with ADDSUB operation. Thus the number
8245/// of \p Opnd0 uses is expected to be equal to 2.
8246/// For example, this function may be called for the following IR:
8247/// %AB = fmul fast <2 x double> %A, %B
8248/// %Sub = fsub fast <2 x double> %AB, %C
8249/// %Add = fadd fast <2 x double> %AB, %C
8250/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8251/// <2 x i32> <i32 0, i32 3>
8252/// There is a def for %Addsub here, which potentially can be replaced by
8253/// X86ISD::ADDSUB operation:
8254/// %Addsub = X86ISD::ADDSUB %AB, %C
8255/// and such ADDSUB can further be replaced with FMADDSUB:
8256/// %Addsub = FMADDSUB %A, %B, %C.
8257///
8258/// The main reason why this method is called before the replacement of the
8259 /// recognized ADDSUB idiom with the ADDSUB operation is that such a replacement
8260 /// is sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
8261/// FMADDSUB is.
8262static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8263 SelectionDAG &DAG,
8264 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8265 unsigned ExpectedUses) {
8266 if (Opnd0.getOpcode() != ISD::FMUL ||
8267 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8268 return false;
8269
8270 // FIXME: These checks must match the similar ones in
8271 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8272 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8273 // or MUL + ADDSUB to FMADDSUB.
8274 const TargetOptions &Options = DAG.getTarget().Options;
8275 bool AllowFusion =
8276 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8277 if (!AllowFusion)
8278 return false;
8279
8280 Opnd2 = Opnd1;
8281 Opnd1 = Opnd0.getOperand(1);
8282 Opnd0 = Opnd0.getOperand(0);
8283
8284 return true;
8285}
8286
8287 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8288 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8289 /// X86ISD::FMSUBADD node accordingly.
8290 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8291 const SDLoc &DL,
8292 const X86Subtarget &Subtarget,
8293 SelectionDAG &DAG) {
8294 SDValue Opnd0, Opnd1;
8295 unsigned NumExtracts;
8296 bool IsSubAdd;
8297 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8298 IsSubAdd))
8299 return SDValue();
8300
8301 MVT VT = BV->getSimpleValueType(0);
8302
8303 // Try to generate X86ISD::FMADDSUB node here.
8304 SDValue Opnd2;
8305 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8306 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8307 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8308 }
8309
8310 // We only support ADDSUB.
8311 if (IsSubAdd)
8312 return SDValue();
8313
8314 // There are no known X86 targets with 512-bit ADDSUB instructions!
8315 // Convert to blend(fsub,fadd).
8316 if (VT.is512BitVector()) {
8317 SmallVector<int> Mask;
8318 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8319 Mask.push_back(I);
8320 Mask.push_back(I + E + 1);
8321 }
8322 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8323 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8324 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8325 }
8326
8327 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8328}
8329
8330 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8331 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8332 // Initialize outputs to known values.
8333 MVT VT = BV->getSimpleValueType(0);
8334 HOpcode = ISD::DELETED_NODE;
8335 V0 = DAG.getUNDEF(VT);
8336 V1 = DAG.getUNDEF(VT);
8337
8338 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8339 // half of the result is calculated independently from the 128-bit halves of
8340 // the inputs, so that makes the index-checking logic below more complicated.
8341 unsigned NumElts = VT.getVectorNumElements();
8342 unsigned GenericOpcode = ISD::DELETED_NODE;
8343 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8344 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8345 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8346 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8347 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8348 // Ignore undef elements.
8349 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8350 if (Op.isUndef())
8351 continue;
8352
8353 // If there's an opcode mismatch, we're done.
8354 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8355 return false;
8356
8357 // Initialize horizontal opcode.
8358 if (HOpcode == ISD::DELETED_NODE) {
8359 GenericOpcode = Op.getOpcode();
8360 switch (GenericOpcode) {
8361 // clang-format off
8362 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8363 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8364 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8365 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8366 default: return false;
8367 // clang-format on
8368 }
8369 }
8370
8371 SDValue Op0 = Op.getOperand(0);
8372 SDValue Op1 = Op.getOperand(1);
8373 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8374 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8375 Op0.getOperand(0) != Op1.getOperand(0) ||
8376 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8377 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8378 return false;
8379
8380 // The source vector is chosen based on which 64-bit half of the
8381 // destination vector is being calculated.
8382 if (j < NumEltsIn64Bits) {
8383 if (V0.isUndef())
8384 V0 = Op0.getOperand(0);
8385 } else {
8386 if (V1.isUndef())
8387 V1 = Op0.getOperand(0);
8388 }
8389
8390 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8391 if (SourceVec != Op0.getOperand(0))
8392 return false;
8393
8394 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8395 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8396 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8397 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8398 (j % NumEltsIn64Bits) * 2;
8399 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8400 continue;
8401
8402 // If this is not a commutative op, this does not match.
8403 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8404 return false;
8405
8406 // Addition is commutative, so try swapping the extract indexes.
8407 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8408 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8409 continue;
8410
8411 // Extract indexes do not match horizontal requirement.
8412 return false;
8413 }
8414 }
8415 // We matched. Opcode and operands are returned by reference as arguments.
8416 return true;
8417}
8418
8419 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8420 const SDLoc &DL, SelectionDAG &DAG,
8421 unsigned HOpcode, SDValue V0, SDValue V1) {
8422 // If either input vector is not the same size as the build vector,
8423 // extract/insert the low bits to the correct size.
8424 // This is free (examples: zmm --> xmm, xmm --> ymm).
8425 MVT VT = BV->getSimpleValueType(0);
8426 unsigned Width = VT.getSizeInBits();
8427 if (V0.getValueSizeInBits() > Width)
8428 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8429 else if (V0.getValueSizeInBits() < Width)
8430 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8431
8432 if (V1.getValueSizeInBits() > Width)
8433 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8434 else if (V1.getValueSizeInBits() < Width)
8435 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8436
8437 unsigned NumElts = VT.getVectorNumElements();
8438 APInt DemandedElts = APInt::getAllOnes(NumElts);
8439 for (unsigned i = 0; i != NumElts; ++i)
8440 if (BV->getOperand(i).isUndef())
8441 DemandedElts.clearBit(i);
8442
8443 // If we don't need the upper xmm, then perform as an xmm hop.
8444 unsigned HalfNumElts = NumElts / 2;
8445 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8446 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8447 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8448 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8449 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8450 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8451 }
8452
8453 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8454}
8455
8456/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8457 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8458 const X86Subtarget &Subtarget,
8459 SelectionDAG &DAG) {
8460 // We need at least 2 non-undef elements to make this worthwhile by default.
8461 unsigned NumNonUndefs =
8462 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8463 if (NumNonUndefs < 2)
8464 return SDValue();
8465
8466 // There are 4 sets of horizontal math operations distinguished by type:
8467 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8468 // subtarget feature. Try to match those "native" patterns first.
8469 MVT VT = BV->getSimpleValueType(0);
8470 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8471 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8472 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8473 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8474 unsigned HOpcode;
8475 SDValue V0, V1;
8476 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8477 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8478 }
8479
8480 // Try harder to match 256-bit ops by using extract/concat.
8481 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8482 return SDValue();
8483
8484 // Count the number of UNDEF operands in the build_vector in input.
8485 unsigned NumElts = VT.getVectorNumElements();
8486 unsigned Half = NumElts / 2;
8487 unsigned NumUndefsLO = 0;
8488 unsigned NumUndefsHI = 0;
8489 for (unsigned i = 0, e = Half; i != e; ++i)
8490 if (BV->getOperand(i)->isUndef())
8491 NumUndefsLO++;
8492
8493 for (unsigned i = Half, e = NumElts; i != e; ++i)
8494 if (BV->getOperand(i)->isUndef())
8495 NumUndefsHI++;
8496
8497 SDValue InVec0, InVec1;
8498 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8499 SDValue InVec2, InVec3;
8500 unsigned X86Opcode;
8501 bool CanFold = true;
8502
8503 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8504 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8505 InVec3) &&
8506 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8507 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8508 X86Opcode = X86ISD::HADD;
8509 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8510 InVec1) &&
8511 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8512 InVec3) &&
8513 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8514 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8515 X86Opcode = X86ISD::HSUB;
8516 else
8517 CanFold = false;
8518
8519 if (CanFold) {
8520 // Do not try to expand this build_vector into a pair of horizontal
8521 // add/sub if we can emit a pair of scalar add/sub.
8522 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8523 return SDValue();
8524
8525 // Convert this build_vector into a pair of horizontal binops followed by
8526 // a concat vector. We must adjust the outputs from the partial horizontal
8527 // matching calls above to account for undefined vector halves.
8528 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8529 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8530 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8531 bool isUndefLO = NumUndefsLO == Half;
8532 bool isUndefHI = NumUndefsHI == Half;
8533 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8534 isUndefHI);
8535 }
8536 }
8537
8538 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8539 VT == MVT::v16i16) {
8540 unsigned X86Opcode;
8541 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8542 InVec1))
8543 X86Opcode = X86ISD::HADD;
8544 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8545 InVec1))
8546 X86Opcode = X86ISD::HSUB;
8547 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8548 InVec1))
8549 X86Opcode = X86ISD::FHADD;
8550 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8551 InVec1))
8552 X86Opcode = X86ISD::FHSUB;
8553 else
8554 return SDValue();
8555
8556 // Don't try to expand this build_vector into a pair of horizontal add/sub
8557 // if we can simply emit a pair of scalar add/sub.
8558 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8559 return SDValue();
8560
8561 // Convert this build_vector into two horizontal add/sub followed by
8562 // a concat vector.
8563 bool isUndefLO = NumUndefsLO == Half;
8564 bool isUndefHI = NumUndefsHI == Half;
8565 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8566 isUndefLO, isUndefHI);
8567 }
8568
8569 return SDValue();
8570}
8571
8572static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8573 SelectionDAG &DAG);
8574
8575/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8576/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8577 /// just apply the bit operation to the two new vectors.
8578 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8579 /// from this, but enough scalar bit operations are created by the later
8580 /// legalization + scalarization stages to need basic support.
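/// For illustration (hypothetical scalars x0..x3): a v4i32 build_vector of
///   (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8)
/// can become (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8)).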
8581 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8582 const X86Subtarget &Subtarget,
8583 SelectionDAG &DAG) {
8584 MVT VT = Op->getSimpleValueType(0);
8585 unsigned NumElems = VT.getVectorNumElements();
8586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8587
8588 // Check that all elements have the same opcode.
8589 // TODO: Should we allow UNDEFs, and if so, how many?
8590 unsigned Opcode = Op->getOperand(0).getOpcode();
8591 for (unsigned i = 1; i < NumElems; ++i)
8592 if (Opcode != Op->getOperand(i).getOpcode())
8593 return SDValue();
8594
8595 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8596 bool IsShift = false;
8597 switch (Opcode) {
8598 default:
8599 return SDValue();
8600 case ISD::SHL:
8601 case ISD::SRL:
8602 case ISD::SRA:
8603 IsShift = true;
8604 break;
8605 case ISD::AND:
8606 case ISD::XOR:
8607 case ISD::OR:
8608 // Don't do this if the buildvector is a splat - we'd replace one
8609 // constant with an entire vector.
8610 if (Op->getSplatValue())
8611 return SDValue();
8612 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8613 return SDValue();
8614 break;
8615 }
8616
8617 SmallVector<SDValue, 4> LHSElts, RHSElts;
8618 for (SDValue Elt : Op->ops()) {
8619 SDValue LHS = Elt.getOperand(0);
8620 SDValue RHS = Elt.getOperand(1);
8621
8622 // We expect the canonicalized RHS operand to be the constant.
8623 if (!isa<ConstantSDNode>(RHS))
8624 return SDValue();
8625
8626 // Extend shift amounts.
8627 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8628 if (!IsShift)
8629 return SDValue();
8630 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8631 }
8632
8633 LHSElts.push_back(LHS);
8634 RHSElts.push_back(RHS);
8635 }
8636
8637 // Limit to shifts by uniform immediates.
8638 // TODO: Only accept vXi8/vXi64 special cases?
8639 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8640 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8641 return SDValue();
8642
8643 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8644 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8645 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8646
8647 if (!IsShift)
8648 return Res;
8649
8650 // Immediately lower the shift to ensure the constant build vector doesn't
8651 // get converted to a constant pool before the shift is lowered.
8652 return LowerShift(Res, Subtarget, DAG);
8653}
8654
8655/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8656/// functionality to do this, so it's all zeros, all ones, or some derivation
8657/// that is cheap to calculate.
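/// For illustration: an all-ones v8i32 build_vector is returned unchanged and
/// later matches a single vpcmpeqd on AVX2, or a pair of v4i32 ops otherwise.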
8658 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8659 SelectionDAG &DAG,
8660 const X86Subtarget &Subtarget) {
8661 MVT VT = Op.getSimpleValueType();
8662
8663 // Vectors containing all zeros can be matched by pxor and xorps.
8664 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8665 return Op;
8666
8667 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8668 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8669 // vpcmpeqd on 256-bit vectors.
8670 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8671 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8672 return Op;
8673
8674 return getOnesVector(VT, DAG, DL);
8675 }
8676
8677 return SDValue();
8678}
8679
8680/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8681/// from a vector of source values and a vector of extraction indices.
8682/// The vectors might be manipulated to match the type of the permute op.
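/// For illustration: a v16i8 source vector and a v16i8 vector of byte indices
/// can be lowered directly to (X86ISD::PSHUFB src, indices) on SSSE3 targets.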
8683static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8684 const SDLoc &DL, SelectionDAG &DAG,
8685 const X86Subtarget &Subtarget) {
8686 MVT ShuffleVT = VT;
8687 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8688 unsigned NumElts = VT.getVectorNumElements();
8689 unsigned SizeInBits = VT.getSizeInBits();
8690
8691 // Adjust IndicesVec to match VT size.
8692 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8693 "Illegal variable permute mask size");
8694 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8695 // Narrow/widen the indices vector to the correct size.
8696 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8697 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8698 NumElts * VT.getScalarSizeInBits());
8699 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8700 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8701 SDLoc(IndicesVec), SizeInBits);
8702 // Zero-extend the index elements within the vector.
8703 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8704 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8705 IndicesVT, IndicesVec);
8706 }
8707 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8708
8709 // Handle a SrcVec that doesn't match the VT type.
8710 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8711 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8712 // Handle larger SrcVec by treating it as a larger permute.
8713 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8714 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8715 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8716 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8717 Subtarget, DAG, SDLoc(IndicesVec));
8718 SDValue NewSrcVec =
8719 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8720 if (NewSrcVec)
8721 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8722 return SDValue();
8723 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8724 // Widen smaller SrcVec to match VT.
8725 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8726 } else
8727 return SDValue();
8728 }
8729
8730 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8731 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8732 EVT SrcVT = Idx.getValueType();
8733 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8734 uint64_t IndexScale = 0;
8735 uint64_t IndexOffset = 0;
8736
8737 // If we're scaling a smaller permute op, then we need to repeat the
8738 // indices, scaling and offsetting them as well.
8739 // e.g. v4i32 -> v16i8 (Scale = 4)
8740 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8741 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8742 for (uint64_t i = 0; i != Scale; ++i) {
8743 IndexScale |= Scale << (i * NumDstBits);
8744 IndexOffset |= i << (i * NumDstBits);
8745 }
8746
8747 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8748 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8749 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8750 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8751 return Idx;
8752 };
8753
8754 unsigned Opcode = 0;
8755 switch (VT.SimpleTy) {
8756 default:
8757 break;
8758 case MVT::v16i8:
8759 if (Subtarget.hasSSSE3())
8760 Opcode = X86ISD::PSHUFB;
8761 break;
8762 case MVT::v8i16:
8763 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8764 Opcode = X86ISD::VPERMV;
8765 else if (Subtarget.hasSSSE3()) {
8766 Opcode = X86ISD::PSHUFB;
8767 ShuffleVT = MVT::v16i8;
8768 }
8769 break;
8770 case MVT::v4f32:
8771 case MVT::v4i32:
8772 if (Subtarget.hasAVX()) {
8773 Opcode = X86ISD::VPERMILPV;
8774 ShuffleVT = MVT::v4f32;
8775 } else if (Subtarget.hasSSSE3()) {
8776 Opcode = X86ISD::PSHUFB;
8777 ShuffleVT = MVT::v16i8;
8778 }
8779 break;
8780 case MVT::v2f64:
8781 case MVT::v2i64:
8782 if (Subtarget.hasAVX()) {
8783 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8784 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8785 Opcode = X86ISD::VPERMILPV;
8786 ShuffleVT = MVT::v2f64;
8787 } else if (Subtarget.hasSSE41()) {
8788 // SSE41 can compare v2i64 - select between indices 0 and 1.
8789 return DAG.getSelectCC(
8790 DL, IndicesVec,
8791 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8792 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8793 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8794 ISD::CondCode::SETEQ);
8795 }
8796 break;
8797 case MVT::v32i8:
8798 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8799 Opcode = X86ISD::VPERMV;
8800 else if (Subtarget.hasXOP()) {
8801 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8802 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8803 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8804 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8805 return DAG.getNode(
8806 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8807 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8808 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8809 } else if (Subtarget.hasAVX()) {
8810 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8811 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8812 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8813 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8814 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8815 ArrayRef<SDValue> Ops) {
8816 // Permute Lo and Hi and then select based on index range.
8817 // This works as SHUFB uses bits[3:0] to permute elements and we don't
8818 // care about the bit[7] as its just an index vector.
8819 SDValue Idx = Ops[2];
8820 EVT VT = Idx.getValueType();
8821 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8822 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8823 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8824                                ISD::CondCode::SETGT);
8825 };
8826 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8827 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8828 PSHUFBBuilder);
8829 }
8830 break;
8831 case MVT::v16i16:
8832 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8833 Opcode = X86ISD::VPERMV;
8834 else if (Subtarget.hasAVX()) {
8835 // Scale to v32i8 and perform as v32i8.
8836 IndicesVec = ScaleIndices(IndicesVec, 2);
8837 return DAG.getBitcast(
8838          VT, createVariablePermute(
8839 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8840 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8841 }
8842 break;
8843 case MVT::v8f32:
8844 case MVT::v8i32:
8845 if (Subtarget.hasAVX2())
8846 Opcode = X86ISD::VPERMV;
8847 else if (Subtarget.hasAVX()) {
8848 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8849 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8850 {0, 1, 2, 3, 0, 1, 2, 3});
8851 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8852 {4, 5, 6, 7, 4, 5, 6, 7});
8853 if (Subtarget.hasXOP())
8854 return DAG.getBitcast(
8855 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8856 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8857 // Permute Lo and Hi and then select based on index range.
8858 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
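      // e.g. an index value of 6 selects the HiHi side (6 > 3), and VPERMILPS
      // then uses bits[1:0] = 2 within the lane, i.e. source element 6.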
8859 SDValue Res = DAG.getSelectCC(
8860 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8861 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8862 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8863          ISD::CondCode::SETGT);
8864 return DAG.getBitcast(VT, Res);
8865 }
8866 break;
8867 case MVT::v4i64:
8868 case MVT::v4f64:
8869 if (Subtarget.hasAVX512()) {
8870 if (!Subtarget.hasVLX()) {
8871 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8872 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8873 SDLoc(SrcVec));
8874 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8875 DAG, SDLoc(IndicesVec));
8876 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8877 DAG, Subtarget);
8878 return extract256BitVector(Res, 0, DAG, DL);
8879 }
8880 Opcode = X86ISD::VPERMV;
8881 } else if (Subtarget.hasAVX()) {
8882 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8883 SDValue LoLo =
8884 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8885 SDValue HiHi =
8886 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8887 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8888 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8889 if (Subtarget.hasXOP())
8890 return DAG.getBitcast(
8891 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8892 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8893 // Permute Lo and Hi and then select based on index range.
8894 // This works as VPERMILPD only uses index bit[1] to permute elements.
8895 SDValue Res = DAG.getSelectCC(
8896 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8897 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8898 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8899          ISD::CondCode::SETGT);
8900 return DAG.getBitcast(VT, Res);
8901 }
8902 break;
8903 case MVT::v64i8:
8904 if (Subtarget.hasVBMI())
8905 Opcode = X86ISD::VPERMV;
8906 break;
8907 case MVT::v32i16:
8908 if (Subtarget.hasBWI())
8909 Opcode = X86ISD::VPERMV;
8910 break;
8911 case MVT::v16f32:
8912 case MVT::v16i32:
8913 case MVT::v8f64:
8914 case MVT::v8i64:
8915 if (Subtarget.hasAVX512())
8916 Opcode = X86ISD::VPERMV;
8917 break;
8918 }
8919 if (!Opcode)
8920 return SDValue();
8921
8922 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8923 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8924 "Illegal variable permute shuffle type");
8925
8926 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8927 if (Scale > 1)
8928 IndicesVec = ScaleIndices(IndicesVec, Scale);
8929
8930 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8931 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8932
8933 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8934 SDValue Res = Opcode == X86ISD::VPERMV
8935 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8936 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8937 return DAG.getBitcast(VT, Res);
8938}
8939
8940// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8941// reasoned to be a permutation of a vector by indices in a non-constant vector.
8942// (build_vector (extract_elt V, (extract_elt I, 0)),
8943// (extract_elt V, (extract_elt I, 1)),
8944// ...
8945// ->
8946// (vpermv I, V)
8947//
8948// TODO: Handle undefs
8949// TODO: Utilize pshufb and zero mask blending to support more efficient
8950// construction of vectors with constant-0 elements.
8951static SDValue
8952 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8953 SelectionDAG &DAG,
8954 const X86Subtarget &Subtarget) {
8955 SDValue SrcVec, IndicesVec;
8956 // Check for a match of the permute source vector and permute index elements.
8957 // This is done by checking that the i-th build_vector operand is of the form:
8958 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8959 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8960 SDValue Op = V.getOperand(Idx);
8961 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8962 return SDValue();
8963
8964 // If this is the first extract encountered in V, set the source vector,
8965 // otherwise verify the extract is from the previously defined source
8966 // vector.
8967 if (!SrcVec)
8968 SrcVec = Op.getOperand(0);
8969 else if (SrcVec != Op.getOperand(0))
8970 return SDValue();
8971 SDValue ExtractedIndex = Op->getOperand(1);
8972 // Peek through extends.
8973 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8974 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8975 ExtractedIndex = ExtractedIndex.getOperand(0);
8976 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8977 return SDValue();
8978
8979 // If this is the first extract from the index vector candidate, set the
8980 // indices vector, otherwise verify the extract is from the previously
8981 // defined indices vector.
8982 if (!IndicesVec)
8983 IndicesVec = ExtractedIndex.getOperand(0);
8984 else if (IndicesVec != ExtractedIndex.getOperand(0))
8985 return SDValue();
8986
8987 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8988 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8989 return SDValue();
8990 }
8991
8992 MVT VT = V.getSimpleValueType();
8993 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8994}
8995
8996SDValue
8997X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8998 SDLoc dl(Op);
8999
9000 MVT VT = Op.getSimpleValueType();
9001 MVT EltVT = VT.getVectorElementType();
9002 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9003 unsigned NumElems = Op.getNumOperands();
9004
9005 // Generate vectors for predicate vectors.
9006 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9007 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9008
9009 if (VT.getVectorElementType() == MVT::bf16 &&
9010 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9011 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9012
9013 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9014 return VectorCst;
9015
9016 unsigned EVTBits = EltVT.getSizeInBits();
9017 APInt UndefMask = APInt::getZero(NumElems);
9018 APInt FrozenUndefMask = APInt::getZero(NumElems);
9019 APInt ZeroMask = APInt::getZero(NumElems);
9020 APInt NonZeroMask = APInt::getZero(NumElems);
9021 bool IsAllConstants = true;
9022 bool OneUseFrozenUndefs = true;
9023 SmallSet<SDValue, 8> Values;
9024 unsigned NumConstants = NumElems;
9025 for (unsigned i = 0; i < NumElems; ++i) {
9026 SDValue Elt = Op.getOperand(i);
9027 if (Elt.isUndef()) {
9028 UndefMask.setBit(i);
9029 continue;
9030 }
9031 if (ISD::isFreezeUndef(Elt.getNode())) {
9032 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9033 FrozenUndefMask.setBit(i);
9034 continue;
9035 }
9036 Values.insert(Elt);
9037 if (!isIntOrFPConstant(Elt)) {
9038 IsAllConstants = false;
9039 NumConstants--;
9040 }
9041 if (X86::isZeroNode(Elt)) {
9042 ZeroMask.setBit(i);
9043 } else {
9044 NonZeroMask.setBit(i);
9045 }
9046 }
9047
9048 // All undef vector. Return an UNDEF.
9049 if (UndefMask.isAllOnes())
9050 return DAG.getUNDEF(VT);
9051
9052 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9053 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9054 return DAG.getFreeze(DAG.getUNDEF(VT));
9055
9056 // All undef/freeze(undef)/zero vector. Return a zero vector.
9057 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9058 return getZeroVector(VT, Subtarget, DAG, dl);
9059
9060 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9061 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9062 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9063 // and blend the FREEZE-UNDEF operands back in.
9064 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9065 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9066 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9067 SmallVector<int, 16> BlendMask(NumElems, -1);
9068 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9069 for (unsigned i = 0; i < NumElems; ++i) {
9070 if (UndefMask[i]) {
9071 BlendMask[i] = -1;
9072 continue;
9073 }
9074 BlendMask[i] = i;
9075 if (!FrozenUndefMask[i])
9076 Elts[i] = Op.getOperand(i);
9077 else
9078 BlendMask[i] += NumElems;
9079 }
9080 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9081 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9082 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9083 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9084 }
9085
9086 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9087
9088 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9089 // be better off lowering to a smaller build vector and padding with
9090 // undef/zero.
9091 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9092       !isFoldableUseOfShuffle(BV)) {
9093 unsigned UpperElems = NumElems / 2;
9094 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9095 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9096 if (NumUpperUndefsOrZeros >= UpperElems) {
9097 if (VT.is512BitVector() &&
9098 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9099 UpperElems = NumElems - (NumElems / 4);
9100 // If freeze(undef) is in any upper elements, force to zero.
9101 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9102 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9103 SDValue NewBV =
9104 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9105 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9106 }
9107 }
9108
9109 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9110 return AddSub;
9111 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9112 return HorizontalOp;
9113 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9114 return Broadcast;
9115 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9116 return BitOp;
9117
9118 unsigned NumZero = ZeroMask.popcount();
9119 unsigned NumNonZero = NonZeroMask.popcount();
9120
9121 // If we are inserting one variable into a vector of non-zero constants, try
9122 // to avoid loading each constant element as a scalar. Load the constants as a
9123 // vector and then insert the variable scalar element. If insertion is not
9124 // supported, fall back to a shuffle to get the scalar blended with the
9125 // constants. Insertion into a zero vector is handled as a special-case
9126 // somewhere below here.
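  // e.g. v4i32 <x, 1, 2, 3> becomes a constant-pool load of <undef, 1, 2, 3>
  // followed by an insertelement of x into element 0.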
9127 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9128 FrozenUndefMask.isZero() &&
9129       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9130        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9131 // Create an all-constant vector. The variable element in the old
9132 // build vector is replaced by undef in the constant vector. Save the
9133 // variable scalar element and its index for use in the insertelement.
9134 LLVMContext &Context = *DAG.getContext();
9135 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9136 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9137 SDValue VarElt;
9138 SDValue InsIndex;
9139 for (unsigned i = 0; i != NumElems; ++i) {
9140 SDValue Elt = Op.getOperand(i);
9141 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9142 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9143 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9144 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9145 else if (!Elt.isUndef()) {
9146 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9147 "Expected one variable element in this vector");
9148 VarElt = Elt;
9149 InsIndex = DAG.getVectorIdxConstant(i, dl);
9150 }
9151 }
9152 Constant *CV = ConstantVector::get(ConstVecOps);
9153 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9154
9155     // The constants we just created may not be legal (e.g. floating point). We
9156     // must lower the vector right here because we cannot guarantee that we'll
9157 // legalize it before loading it. This is also why we could not just create
9158 // a new build vector here. If the build vector contains illegal constants,
9159 // it could get split back up into a series of insert elements.
9160 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9161 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9162     MachinePointerInfo MPI =
9163         MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9164 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9165 unsigned InsertC = InsIndex->getAsZExtVal();
9166 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9167 if (InsertC < NumEltsInLow128Bits)
9168 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9169
9170 // There's no good way to insert into the high elements of a >128-bit
9171 // vector, so use shuffles to avoid an extract/insert sequence.
9172 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9173 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9174 SmallVector<int, 8> ShuffleMask;
9175 unsigned NumElts = VT.getVectorNumElements();
9176 for (unsigned i = 0; i != NumElts; ++i)
9177 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9178 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9179 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9180 }
9181
9182 // Special case for single non-zero, non-undef, element.
9183 if (NumNonZero == 1) {
9184 unsigned Idx = NonZeroMask.countr_zero();
9185 SDValue Item = Op.getOperand(Idx);
9186
9187 // If we have a constant or non-constant insertion into the low element of
9188 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9189 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9190 // depending on what the source datatype is.
9191 if (Idx == 0) {
9192 if (NumZero == 0)
9193 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9194
9195 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9196 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9197 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9198 assert((VT.is128BitVector() || VT.is256BitVector() ||
9199 VT.is512BitVector()) &&
9200 "Expected an SSE value type!");
9201 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9202 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9203 // zero vector.
9204 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9205 }
9206
9207 // We can't directly insert an i8 or i16 into a vector, so zero extend
9208 // it to i32 first.
9209 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9210 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9211 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9212 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9213 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9214 return DAG.getBitcast(VT, Item);
9215 }
9216 }
9217
9218 // Is it a vector logical left shift?
9219 if (NumElems == 2 && Idx == 1 &&
9220 X86::isZeroNode(Op.getOperand(0)) &&
9221 !X86::isZeroNode(Op.getOperand(1))) {
9222 unsigned NumBits = VT.getSizeInBits();
9223 return getVShift(true, VT,
9224                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9225 VT, Op.getOperand(1)),
9226 NumBits/2, DAG, *this, dl);
9227 }
9228
9229 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9230 return SDValue();
9231
9232 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9233 // is a non-constant being inserted into an element other than the low one,
9234 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9235 // movd/movss) to move this into the low element, then shuffle it into
9236 // place.
9237 if (EVTBits == 32) {
9238 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9239 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9240 }
9241 }
9242
9243 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9244 if (Values.size() == 1) {
9245 if (EVTBits == 32) {
9246 // Instead of a shuffle like this:
9247 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9248 // Check if it's possible to issue this instead.
9249       // shuffle (vload ptr), undef, <1, 1, 1, 1>
9250 unsigned Idx = NonZeroMask.countr_zero();
9251 SDValue Item = Op.getOperand(Idx);
9252 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9253 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9254 }
9255 return SDValue();
9256 }
9257
9258 // A vector full of immediates; various special cases are already
9259 // handled, so this is best done with a single constant-pool load.
9260 if (IsAllConstants)
9261 return SDValue();
9262
9263 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9264 return V;
9265
9266 // See if we can use a vector load to get all of the elements.
9267 {
9268 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9269 if (SDValue LD =
9270 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9271 return LD;
9272 }
9273
9274 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9275 // build_vector and broadcast it.
9276 // TODO: We could probably generalize this more.
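  // e.g. v8i32 <a,b,a,b,a,b,a,b> becomes a v4i32 build_vector <a,b,u,u>,
  // bitcast to v2i64, broadcast to v4i64, then bitcast back to v8i32.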
9277 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9278 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9279 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9280 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9281 // Make sure all the even/odd operands match.
9282 for (unsigned i = 2; i != NumElems; ++i)
9283 if (Ops[i % 2] != Op.getOperand(i))
9284 return false;
9285 return true;
9286 };
9287 if (CanSplat(Op, NumElems, Ops)) {
9288 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9289 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9290 // Create a new build vector and cast to v2i64/v2f64.
9291 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9292 DAG.getBuildVector(NarrowVT, dl, Ops));
9293 // Broadcast from v2i64/v2f64 and cast to final VT.
9294 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9295 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9296 NewBV));
9297 }
9298 }
9299
9300 // For AVX-length vectors, build the individual 128-bit pieces and use
9301 // shuffles to put them in place.
9302 if (VT.getSizeInBits() > 128) {
9303 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9304
9305 // Build both the lower and upper subvector.
9306 SDValue Lower =
9307 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9308     SDValue Upper = DAG.getBuildVector(
9309 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9310
9311 // Recreate the wider vector with the lower and upper part.
9312 return concatSubVectors(Lower, Upper, DAG, dl);
9313 }
9314
9315 // Let legalizer expand 2-wide build_vectors.
9316 if (EVTBits == 64) {
9317 if (NumNonZero == 1) {
9318 // One half is zero or undef.
9319 unsigned Idx = NonZeroMask.countr_zero();
9320       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9321 Op.getOperand(Idx));
9322 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9323 }
9324 return SDValue();
9325 }
9326
9327 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9328 if (EVTBits == 8 && NumElems == 16)
9329 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9330 NumZero, DAG, Subtarget))
9331 return V;
9332
9333 if (EltVT == MVT::i16 && NumElems == 8)
9334 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9335 NumZero, DAG, Subtarget))
9336 return V;
9337
9338 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9339 if (EVTBits == 32 && NumElems == 4)
9340 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9341 return V;
9342
9343 // If element VT is == 32 bits, turn it into a number of shuffles.
9344 if (NumElems == 4 && NumZero > 0) {
9345 SmallVector<SDValue, 8> Ops(NumElems);
9346 for (unsigned i = 0; i < 4; ++i) {
9347 bool isZero = !NonZeroMask[i];
9348 if (isZero)
9349 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9350 else
9351 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9352 }
9353
9354 for (unsigned i = 0; i < 2; ++i) {
9355 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9356 default: llvm_unreachable("Unexpected NonZero count");
9357 case 0:
9358 Ops[i] = Ops[i*2]; // Must be a zero vector.
9359 break;
9360 case 1:
9361 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9362 break;
9363 case 2:
9364 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9365 break;
9366 case 3:
9367 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9368 break;
9369 }
9370 }
9371
9372 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9373 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9374 int MaskVec[] = {
9375 Reverse1 ? 1 : 0,
9376 Reverse1 ? 0 : 1,
9377 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9378 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9379 };
9380 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9381 }
9382
9383 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9384
9385 // Check for a build vector from mostly shuffle plus few inserting.
9386 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9387 return Sh;
9388
9389 // For SSE 4.1, use insertps to put the high elements into the low element.
9390 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9391     SDValue Result;
9392 if (!Op.getOperand(0).isUndef())
9393 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9394 else
9395 Result = DAG.getUNDEF(VT);
9396
9397 for (unsigned i = 1; i < NumElems; ++i) {
9398 if (Op.getOperand(i).isUndef()) continue;
9399 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9400 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9401 }
9402 return Result;
9403 }
9404
9405 // Otherwise, expand into a number of unpckl*, start by extending each of
9406 // our (non-undef) elements to the full vector width with the element in the
9407 // bottom slot of the vector (which generates no code for SSE).
9408 SmallVector<SDValue, 8> Ops(NumElems);
9409 for (unsigned i = 0; i < NumElems; ++i) {
9410 if (!Op.getOperand(i).isUndef())
9411 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9412 else
9413 Ops[i] = DAG.getUNDEF(VT);
9414 }
9415
9416 // Next, we iteratively mix elements, e.g. for v4f32:
9417 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9418 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9419 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9420 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9421 // Generate scaled UNPCKL shuffle mask.
9422     SmallVector<int, 16> Mask;
9423 for(unsigned i = 0; i != Scale; ++i)
9424 Mask.push_back(i);
9425 for (unsigned i = 0; i != Scale; ++i)
9426 Mask.push_back(NumElems+i);
9427 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9428
9429 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9430 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9431 }
9432 return Ops[0];
9433}
9434
9435// 256-bit AVX can use the vinsertf128 instruction
9436// to create 256-bit vectors from two other 128-bit ones.
9437// TODO: Detect subvector broadcast here instead of DAG combine?
9438 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9439 const X86Subtarget &Subtarget) {
9440 SDLoc dl(Op);
9441 MVT ResVT = Op.getSimpleValueType();
9442
9443 assert((ResVT.is256BitVector() ||
9444 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9445
9446 unsigned NumOperands = Op.getNumOperands();
9447 unsigned NumFreezeUndef = 0;
9448 unsigned NumZero = 0;
9449 unsigned NumNonZero = 0;
9450 unsigned NonZeros = 0;
9451 for (unsigned i = 0; i != NumOperands; ++i) {
9452 SDValue SubVec = Op.getOperand(i);
9453 if (SubVec.isUndef())
9454 continue;
9455 if (ISD::isFreezeUndef(SubVec.getNode())) {
9456 // If the freeze(undef) has multiple uses then we must fold to zero.
9457 if (SubVec.hasOneUse())
9458 ++NumFreezeUndef;
9459 else
9460 ++NumZero;
9461 }
9462 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9463 ++NumZero;
9464 else {
9465 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9466 NonZeros |= 1 << i;
9467 ++NumNonZero;
9468 }
9469 }
9470
9471 // If we have more than 2 non-zeros, build each half separately.
9472 if (NumNonZero > 2) {
9473 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9474 ArrayRef<SDUse> Ops = Op->ops();
9475 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9476 Ops.slice(0, NumOperands/2));
9477 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9478 Ops.slice(NumOperands/2));
9479 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9480 }
9481
9482 // Otherwise, build it up through insert_subvectors.
9483 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9484 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9485 : DAG.getUNDEF(ResVT));
9486
9487 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9488 unsigned NumSubElems = SubVT.getVectorNumElements();
9489 for (unsigned i = 0; i != NumOperands; ++i) {
9490 if ((NonZeros & (1 << i)) == 0)
9491 continue;
9492
9493 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9494 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9495 }
9496
9497 return Vec;
9498}
9499
9500// Returns true if the given node is a type promotion (by concatenating i1
9501// zeros) of the result of a node that already zeros all upper bits of
9502// k-register.
9503// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9504 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9505 const X86Subtarget &Subtarget,
9506 SelectionDAG & DAG) {
9507 SDLoc dl(Op);
9508 MVT ResVT = Op.getSimpleValueType();
9509 unsigned NumOperands = Op.getNumOperands();
9510
9511 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9512 "Unexpected number of operands in CONCAT_VECTORS");
9513
9514 uint64_t Zeros = 0;
9515 uint64_t NonZeros = 0;
9516 for (unsigned i = 0; i != NumOperands; ++i) {
9517 SDValue SubVec = Op.getOperand(i);
9518 if (SubVec.isUndef())
9519 continue;
9520 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9521 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9522 Zeros |= (uint64_t)1 << i;
9523 else
9524 NonZeros |= (uint64_t)1 << i;
9525 }
9526
9527 unsigned NumElems = ResVT.getVectorNumElements();
9528
9529   // If we are inserting a non-zero vector and there are zeros in the LSBs and
9530   // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9531   // insert_subvector would give us two kshifts.
9532 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9533 Log2_64(NonZeros) != NumOperands - 1) {
9534 unsigned Idx = Log2_64(NonZeros);
9535 SDValue SubVec = Op.getOperand(Idx);
9536 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9537 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9538 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9539 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9540 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9541 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9542 DAG.getVectorIdxConstant(0, dl));
9543 }
9544
9545 // If there are zero or one non-zeros we can handle this very simply.
9546 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9547 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9548 if (!NonZeros)
9549 return Vec;
9550 unsigned Idx = Log2_64(NonZeros);
9551 SDValue SubVec = Op.getOperand(Idx);
9552 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9553 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9554 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9555 }
9556
9557 if (NumOperands > 2) {
9558 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9559 ArrayRef<SDUse> Ops = Op->ops();
9560 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9561 Ops.slice(0, NumOperands / 2));
9562 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9563 Ops.slice(NumOperands / 2));
9564 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9565 }
9566
9567 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9568
9569 if (ResVT.getVectorNumElements() >= 16)
9570 return Op; // The operation is legal with KUNPCK
9571
9572 SDValue Vec =
9573 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9574 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9575 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9576 DAG.getVectorIdxConstant(NumElems / 2, dl));
9577}
9578
9579 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9580 const X86Subtarget &Subtarget,
9581 SelectionDAG &DAG) {
9582 MVT VT = Op.getSimpleValueType();
9583 if (VT.getVectorElementType() == MVT::i1)
9584 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9585
9586 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9587 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9588 Op.getNumOperands() == 4)));
9589
9590 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9591 // from two other 128-bit ones.
9592
9593 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9594 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9595}
9596
9597//===----------------------------------------------------------------------===//
9598// Vector shuffle lowering
9599//
9600// This is an experimental code path for lowering vector shuffles on x86. It is
9601// designed to handle arbitrary vector shuffles and blends, gracefully
9602// degrading performance as necessary. It works hard to recognize idiomatic
9603// shuffles and lower them to optimal instruction patterns without leaving
9604// a framework that allows reasonably efficient handling of all vector shuffle
9605// patterns.
9606//===----------------------------------------------------------------------===//
9607
9608/// Tiny helper function to identify a no-op mask.
9609///
9610/// This is a somewhat boring predicate function. It checks whether the mask
9611/// array input, which is assumed to be a single-input shuffle mask of the kind
9612/// used by the X86 shuffle instructions (not a fully general
9613/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9614/// in-place shuffle are 'no-op's.
9615 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9616 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9617 assert(Mask[i] >= -1 && "Out of bound mask element!");
9618 if (Mask[i] >= 0 && Mask[i] != i)
9619 return false;
9620 }
9621 return true;
9622}
9623
9624/// Test whether there are elements crossing LaneSizeInBits lanes in this
9625/// shuffle mask.
9626///
9627/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9628/// and we routinely test for these.
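/// e.g. for v8f32 (four elements per 128-bit lane), <4,5,6,7,0,1,2,3> crosses
/// lanes, while <1,0,3,2,5,4,7,6> stays within each lane.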
9629static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9630 unsigned ScalarSizeInBits,
9631 ArrayRef<int> Mask) {
9632 assert(LaneSizeInBits && ScalarSizeInBits &&
9633 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9634 "Illegal shuffle lane size");
9635 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9636 int Size = Mask.size();
9637 for (int i = 0; i < Size; ++i)
9638 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9639 return true;
9640 return false;
9641}
9642
9643/// Test whether there are elements crossing 128-bit lanes in this
9644/// shuffle mask.
9645 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9646 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9647}
9648
9649/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9650/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9651/// better support 'repeated mask + lane permute' style shuffles.
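/// e.g. for v8f32, the lane-crossing mask <4,5,6,7,0,1,2,3> is not multi-lane
/// (each 128-bit lane reads from a single source lane), but <0,4,1,5,2,6,3,7>
/// is, since its lower lane mixes elements from both source lanes.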
9652static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9653 unsigned ScalarSizeInBits,
9654 ArrayRef<int> Mask) {
9655 assert(LaneSizeInBits && ScalarSizeInBits &&
9656 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9657 "Illegal shuffle lane size");
9658 int NumElts = Mask.size();
9659 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9660 int NumLanes = NumElts / NumEltsPerLane;
9661 if (NumLanes > 1) {
9662 for (int i = 0; i != NumLanes; ++i) {
9663 int SrcLane = -1;
9664 for (int j = 0; j != NumEltsPerLane; ++j) {
9665 int M = Mask[(i * NumEltsPerLane) + j];
9666 if (M < 0)
9667 continue;
9668 int Lane = (M % NumElts) / NumEltsPerLane;
9669 if (SrcLane >= 0 && SrcLane != Lane)
9670 return true;
9671 SrcLane = Lane;
9672 }
9673 }
9674 }
9675 return false;
9676}
9677
9678/// Test whether a shuffle mask is equivalent within each sub-lane.
9679///
9680/// This checks a shuffle mask to see if it is performing the same
9681/// lane-relative shuffle in each sub-lane. This trivially implies
9682/// that it is also not lane-crossing. It may however involve a blend from the
9683/// same lane of a second vector.
9684///
9685/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9686/// non-trivial to compute in the face of undef lanes. The representation is
9687/// suitable for use with existing 128-bit shuffles as entries from the second
9688/// vector have been remapped to [LaneSize, 2*LaneSize).
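/// e.g. for v8f32, the mask <0,8,1,9,4,12,5,13> repeats <0,4,1,5> in each
/// 128-bit lane (second-vector entries 8,9,12,13 remap to 4 and 5).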
9689static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9690 ArrayRef<int> Mask,
9691 SmallVectorImpl<int> &RepeatedMask) {
9692 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9693 RepeatedMask.assign(LaneSize, -1);
9694 int Size = Mask.size();
9695 for (int i = 0; i < Size; ++i) {
9696 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9697 if (Mask[i] < 0)
9698 continue;
9699 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9700 // This entry crosses lanes, so there is no way to model this shuffle.
9701 return false;
9702
9703 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9704 // Adjust second vector indices to start at LaneSize instead of Size.
9705 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9706 : Mask[i] % LaneSize + LaneSize;
9707 if (RepeatedMask[i % LaneSize] < 0)
9708 // This is the first non-undef entry in this slot of a 128-bit lane.
9709 RepeatedMask[i % LaneSize] = LocalM;
9710 else if (RepeatedMask[i % LaneSize] != LocalM)
9711 // Found a mismatch with the repeated mask.
9712 return false;
9713 }
9714 return true;
9715}
9716
9717/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9718static bool
9719 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9720 SmallVectorImpl<int> &RepeatedMask) {
9721 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9722}
9723
9724static bool
9725 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9726 SmallVector<int, 32> RepeatedMask;
9727 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9728}
9729
9730/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9731static bool
9732 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9733 SmallVectorImpl<int> &RepeatedMask) {
9734 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9735}
9736
9737/// Test whether a target shuffle mask is equivalent within each sub-lane.
9738/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9739static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9740 unsigned EltSizeInBits,
9741 ArrayRef<int> Mask,
9742 SmallVectorImpl<int> &RepeatedMask) {
9743 int LaneSize = LaneSizeInBits / EltSizeInBits;
9744 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9745 int Size = Mask.size();
9746 for (int i = 0; i < Size; ++i) {
9747 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9748 if (Mask[i] == SM_SentinelUndef)
9749 continue;
9750 if (Mask[i] == SM_SentinelZero) {
9751 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9752 return false;
9753 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9754 continue;
9755 }
9756 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9757 // This entry crosses lanes, so there is no way to model this shuffle.
9758 return false;
9759
9760 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9761 // later vector indices to start at multiples of LaneSize instead of Size.
9762 int LaneM = Mask[i] / Size;
9763 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9764 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9765 // This is the first non-undef entry in this slot of a 128-bit lane.
9766 RepeatedMask[i % LaneSize] = LocalM;
9767 else if (RepeatedMask[i % LaneSize] != LocalM)
9768 // Found a mismatch with the repeated mask.
9769 return false;
9770 }
9771 return true;
9772}
9773
9774/// Test whether a target shuffle mask is equivalent within each sub-lane.
9775/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9776static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9777 ArrayRef<int> Mask,
9778 SmallVectorImpl<int> &RepeatedMask) {
9779 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9780 Mask, RepeatedMask);
9781}
9782
9783/// Checks whether the vector elements referenced by two shuffle masks are
9784/// equivalent.
9785static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9786 int Idx, int ExpectedIdx) {
9787 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9788 ExpectedIdx < MaskSize && "Out of range element index");
9789 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9790 return false;
9791
9792 switch (Op.getOpcode()) {
9793 case ISD::BUILD_VECTOR:
9794 // If the values are build vectors, we can look through them to find
9795 // equivalent inputs that make the shuffles equivalent.
9796 // TODO: Handle MaskSize != Op.getNumOperands()?
9797 if (MaskSize == (int)Op.getNumOperands() &&
9798 MaskSize == (int)ExpectedOp.getNumOperands())
9799 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9800 break;
9801 case X86ISD::VBROADCAST:
9802   case X86ISD::VBROADCAST_LOAD:
9803 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9804 return (Op == ExpectedOp &&
9805 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9806 case X86ISD::HADD:
9807 case X86ISD::HSUB:
9808 case X86ISD::FHADD:
9809 case X86ISD::FHSUB:
9810 case X86ISD::PACKSS:
9811 case X86ISD::PACKUS:
9812 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9813 // TODO: Handle MaskSize != NumElts?
9814 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9815 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9816 MVT VT = Op.getSimpleValueType();
9817 int NumElts = VT.getVectorNumElements();
9818 if (MaskSize == NumElts) {
9819 int NumLanes = VT.getSizeInBits() / 128;
9820 int NumEltsPerLane = NumElts / NumLanes;
9821 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9822 bool SameLane =
9823 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9824 bool SameElt =
9825 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9826 return SameLane && SameElt;
9827 }
9828 }
9829 break;
9830 }
9831
9832 return false;
9833}
9834
9835/// Checks whether a shuffle mask is equivalent to an explicit list of
9836/// arguments.
9837///
9838/// This is a fast way to test a shuffle mask against a fixed pattern:
9839///
9840/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
9841///
9842/// It returns true if the mask is exactly as wide as the argument list, and
9843/// each element of the mask is either -1 (signifying undef) or the value given
9844/// in the argument.
9845static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9846 SDValue V1 = SDValue(),
9847 SDValue V2 = SDValue()) {
9848 int Size = Mask.size();
9849 if (Size != (int)ExpectedMask.size())
9850 return false;
9851
9852 for (int i = 0; i < Size; ++i) {
9853 assert(Mask[i] >= -1 && "Out of bound mask element!");
9854 int MaskIdx = Mask[i];
9855 int ExpectedIdx = ExpectedMask[i];
9856 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9857 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9858 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9859 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9860 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9861 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9862 return false;
9863 }
9864 }
9865 return true;
9866}
9867
9868/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9869///
9870/// The masks must be exactly the same width.
9871///
9872/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9873/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9874///
9875/// SM_SentinelZero is accepted as a valid negative index but must match in
9876/// both, or via a known bits test.
9877 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9878 ArrayRef<int> ExpectedMask,
9879 const SelectionDAG &DAG,
9880 SDValue V1 = SDValue(),
9881 SDValue V2 = SDValue()) {
9882 int Size = Mask.size();
9883 if (Size != (int)ExpectedMask.size())
9884 return false;
9885 assert(llvm::all_of(ExpectedMask,
9886 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9887 "Illegal target shuffle mask");
9888
9889 // Check for out-of-range target shuffle mask indices.
9890 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9891 return false;
9892
9893 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9894 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9895 !V1.getValueType().isVector()))
9896 V1 = SDValue();
9897 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9898 !V2.getValueType().isVector()))
9899 V2 = SDValue();
9900
9901 APInt ZeroV1 = APInt::getZero(Size);
9902 APInt ZeroV2 = APInt::getZero(Size);
9903
9904 for (int i = 0; i < Size; ++i) {
9905 int MaskIdx = Mask[i];
9906 int ExpectedIdx = ExpectedMask[i];
9907 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9908 continue;
9909 if (MaskIdx == SM_SentinelZero) {
9910 // If we need this expected index to be a zero element, then update the
9911 // relevant zero mask and perform the known bits at the end to minimize
9912 // repeated computes.
9913 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9914 if (ExpectedV &&
9915 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9916 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9917 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9918 ZeroMask.setBit(BitIdx);
9919 continue;
9920 }
9921 }
9922 if (MaskIdx >= 0) {
9923 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9924 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9925 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9926 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9927 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9928 continue;
9929 }
9930 return false;
9931 }
9932 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9933 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9934}
9935
9936// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9937// instructions.
9938 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9939 const SelectionDAG &DAG) {
9940 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9941 return false;
9942
9943 SmallVector<int, 8> Unpcklwd;
9944 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9945 /* Unary = */ false);
9946 SmallVector<int, 8> Unpckhwd;
9947 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9948 /* Unary = */ false);
9949 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9950 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9951 return IsUnpackwdMask;
9952}
9953
9954 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9955 const SelectionDAG &DAG) {
9956 // Create 128-bit vector type based on mask size.
9957 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9958 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9959
9960 // We can't assume a canonical shuffle mask, so try the commuted version too.
9961 SmallVector<int, 4> CommutedMask(Mask);
9962   ShuffleVectorSDNode::commuteMask(CommutedMask);
9963
9964 // Match any of unary/binary or low/high.
9965 for (unsigned i = 0; i != 4; ++i) {
9966 SmallVector<int, 16> UnpackMask;
9967 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9968 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9969 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9970 return true;
9971 }
9972 return false;
9973}
9974
9975/// Return true if a shuffle mask chooses elements identically in its top and
9976/// bottom halves. For example, any splat mask has the same top and bottom
9977/// halves. If an element is undefined in only one half of the mask, the halves
9978/// are not considered identical.
9979 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9980 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9981 unsigned HalfSize = Mask.size() / 2;
9982 for (unsigned i = 0; i != HalfSize; ++i) {
9983 if (Mask[i] != Mask[i + HalfSize])
9984 return false;
9985 }
9986 return true;
9987}
9988
9989/// Get a 4-lane 8-bit shuffle immediate for a mask.
9990///
9991/// This helper function produces an 8-bit shuffle immediate corresponding to
9992/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9993/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9994/// example.
9995///
9996/// NB: We rely heavily on "undef" masks preserving the input lane.
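/// e.g. the mask <1,2,3,0> encodes as 0b00111001 (0x39), and a single-element
/// mask such as <2,-1,-1,-1> is splatted to <2,2,2,2> and encodes as 0xAA.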
9997static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9998 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9999 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10000 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10001 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10002 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10003
10004 // If the mask only uses one non-undef element, then fully 'splat' it to
10005 // improve later broadcast matching.
10006 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10007 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10008
10009 int FirstElt = Mask[FirstIndex];
10010 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10011 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10012
10013 unsigned Imm = 0;
10014 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10015 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10016 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10017 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10018 return Imm;
10019}
10020
10021 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10022 SelectionDAG &DAG) {
10023 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10024}
10025
10026// Canonicalize SHUFPD mask to improve chances of further folding.
10027// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
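// e.g. the v4f64 mask <1,0,-1,1> keeps the undef element in place and encodes
// as 0b1001 (bit0 = 1, bit1 = 0, bit2 = I & 1 = 0, bit3 = 1).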
10028static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10029 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10030 "Unexpected SHUFPD mask size");
10031 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10032 "Unexpected SHUFPD mask elements");
10033
10034 // If the mask only uses one non-undef element, then fully 'splat' it to
10035 // improve later broadcast matching.
10036 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10037 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10038 "All undef shuffle mask");
10039
10040 int FirstElt = Mask[FirstIndex];
10041 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10042 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10043 unsigned Imm = 0;
10044 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10045 Imm |= FirstElt << I;
10046 return Imm;
10047 }
10048
10049 // Attempt to keep any undef elements in place to improve chances of the
10050 // shuffle becoming a (commutative) blend.
10051 unsigned Imm = 0;
10052 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10053 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10054
10055 return Imm;
10056}
10057
10058 static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10059 SelectionDAG &DAG) {
10060 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10061}
10062
10063 // The shuffle result has the form:
10064 //   0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements appear in
10065 //   ascending order. Each element of Zeroable corresponds to a particular
10066 //   element of Mask, as described in computeZeroableShuffleElements.
10067 //
10068 // The function looks for a sub-mask whose non-zero elements are in
10069 // increasing order; if such a sub-mask exists, it returns true.
10070static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10071 ArrayRef<int> Mask, const EVT &VectorType,
10072 bool &IsZeroSideLeft) {
10073 int NextElement = -1;
10074 // Check if the Mask's nonzero elements are in increasing order.
10075 for (int i = 0, e = Mask.size(); i < e; i++) {
10076     // Check that the mask's zero elements are built only from zeros.
10077 assert(Mask[i] >= -1 && "Out of bound mask element!");
10078 if (Mask[i] < 0)
10079 return false;
10080 if (Zeroable[i])
10081 continue;
10082     // Find the lowest non-zero element.
10083 if (NextElement < 0) {
10084 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10085 IsZeroSideLeft = NextElement != 0;
10086 }
10087     // Exit if the mask's non-zero elements are not in increasing order.
10088 if (NextElement != Mask[i])
10089 return false;
10090 NextElement++;
10091 }
10092 return true;
10093}
10094
10095/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10096 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10097 ArrayRef<int> Mask, SDValue V1,
10098 SDValue V2, const APInt &Zeroable,
10099 const X86Subtarget &Subtarget,
10100 SelectionDAG &DAG) {
10101 int Size = Mask.size();
10102 int LaneSize = 128 / VT.getScalarSizeInBits();
10103 const int NumBytes = VT.getSizeInBits() / 8;
10104 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10105
10106 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10107 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10108 (Subtarget.hasBWI() && VT.is512BitVector()));
10109
10110 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10111 // Sign bit set in i8 mask means zero element.
10112 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10113
10114 SDValue V;
10115 for (int i = 0; i < NumBytes; ++i) {
10116 int M = Mask[i / NumEltBytes];
10117 if (M < 0) {
10118 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10119 continue;
10120 }
10121 if (Zeroable[i / NumEltBytes]) {
10122 PSHUFBMask[i] = ZeroMask;
10123 continue;
10124 }
10125
10126 // We can only use a single input of V1 or V2.
10127 SDValue SrcV = (M >= Size ? V2 : V1);
10128 if (V && V != SrcV)
10129 return SDValue();
10130 V = SrcV;
10131 M %= Size;
10132
10133 // PSHUFB can't cross lanes, ensure this doesn't happen.
10134 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10135 return SDValue();
10136
10137 M = M % LaneSize;
10138 M = M * NumEltBytes + (i % NumEltBytes);
10139 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10140 }
10141 assert(V && "Failed to find a source input");
10142
10143 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10144 return DAG.getBitcast(
10145 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10146 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10147}
10148
10149static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10150 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10151 const SDLoc &dl);
10152
10153 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
10154 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10155 SDValue V2, ArrayRef<int> Mask,
10156 const APInt &Zeroable,
10157 const X86Subtarget &Subtarget,
10158 SelectionDAG &DAG) {
10159 bool IsLeftZeroSide = true;
10160 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10161 IsLeftZeroSide))
10162 return SDValue();
10163 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10164   MVT IntegerType =
10165 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10166 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10167 unsigned NumElts = VT.getVectorNumElements();
10168 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10169 "Unexpected number of vector elements");
10170 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10171 Subtarget, DAG, DL);
10172 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10173 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10174 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10175}
10176
10177static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10178 unsigned &UnpackOpcode, bool IsUnary,
10179 ArrayRef<int> TargetMask, const SDLoc &DL,
10180 SelectionDAG &DAG,
10181 const X86Subtarget &Subtarget) {
10182 int NumElts = VT.getVectorNumElements();
10183
10184 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10185 for (int i = 0; i != NumElts; i += 2) {
10186 int M1 = TargetMask[i + 0];
10187 int M2 = TargetMask[i + 1];
10188 Undef1 &= (SM_SentinelUndef == M1);
10189 Undef2 &= (SM_SentinelUndef == M2);
10190 Zero1 &= isUndefOrZero(M1);
10191 Zero2 &= isUndefOrZero(M2);
10192 }
10193 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10194 "Zeroable shuffle detected");
10195
10196 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10197 SmallVector<int, 64> Unpckl, Unpckh;
10198 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10199 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10200 (IsUnary ? V1 : V2))) {
10201 UnpackOpcode = X86ISD::UNPCKL;
10202 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10203 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10204 return true;
10205 }
10206
10207 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10208 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10209 (IsUnary ? V1 : V2))) {
10210 UnpackOpcode = X86ISD::UNPCKH;
10211 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10212 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10213 return true;
10214 }
10215
10216 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10217 if (IsUnary && (Zero1 || Zero2)) {
10218 // Don't bother if we can blend instead.
10219 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10220 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10221 return false;
10222
10223 bool MatchLo = true, MatchHi = true;
10224 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10225 int M = TargetMask[i];
10226
10227 // Ignore if the input is known to be zero or the index is undef.
10228 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10229 (M == SM_SentinelUndef))
10230 continue;
10231
10232 MatchLo &= (M == Unpckl[i]);
10233 MatchHi &= (M == Unpckh[i]);
10234 }
10235
10236 if (MatchLo || MatchHi) {
10237 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10238 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10239 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10240 return true;
10241 }
10242 }
10243
10244 // If a binary shuffle, commute and try again.
10245 if (!IsUnary) {
10246     ShuffleVectorSDNode::commuteMask(Unpckl);
10247 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10248 UnpackOpcode = X86ISD::UNPCKL;
10249 std::swap(V1, V2);
10250 return true;
10251 }
10252
10253     ShuffleVectorSDNode::commuteMask(Unpckh);
10254 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10255 UnpackOpcode = X86ISD::UNPCKH;
10256 std::swap(V1, V2);
10257 return true;
10258 }
10259 }
10260
10261 return false;
10262}
10263
10264// X86 has dedicated unpack instructions that can handle specific blend
10265// operations: UNPCKH and UNPCKL.
10266 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10266 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10267 SDValue V2, ArrayRef<int> Mask,
10268 SelectionDAG &DAG) {
10269 SmallVector<int, 8> Unpckl;
10270 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10271 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10272 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10273
10274 SmallVector<int, 8> Unpckh;
10275 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10276 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10277 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10278
10279 // Commute and try again.
10280 ShuffleVectorSDNode::commuteMask(Unpckl);
10281 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10282 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10283
10284 ShuffleVectorSDNode::commuteMask(Unpckh);
10285 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10286 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10287
10288 return SDValue();
10289}
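// Illustrative sketch (not part of this file): how the unpack lo/hi masks the
// matching code above tests against interleave the low (or high) half of each
// 128-bit lane of two inputs; indices >= NumElts refer to the second input.
// This mirrors my reading of createUnpackShuffleMask; the helper name
// makeUnpackMask is hypothetical.
#include <cstdio>
#include <vector>

static std::vector<int> makeUnpackMask(int NumElts, int EltBits, bool Lo,
                                       bool Unary) {
  std::vector<int> Mask;
  int NumEltsPerLane = 128 / EltBits;
  for (int i = 0; i != NumElts; ++i) {
    int LaneStart = (i / NumEltsPerLane) * NumEltsPerLane;
    int Pos = (i % NumEltsPerLane) / 2 + LaneStart;
    Pos += Unary ? 0 : NumElts * (i % 2); // odd slots come from the 2nd input
    Pos += Lo ? 0 : NumEltsPerLane / 2;   // hi variant reads the upper half
    Mask.push_back(Pos);
  }
  return Mask;
}

int main() {
  // v8i16 UNPCKL of two inputs: <0,8,1,9,2,10,3,11>.
  for (int M : makeUnpackMask(8, 16, /*Lo=*/true, /*Unary=*/false))
    std::printf("%d ", M);
  std::printf("\n");
}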
10290
10291/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10292/// followed by unpack 256-bit.
10293 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10294 SDValue V2, ArrayRef<int> Mask,
10295 SelectionDAG &DAG) {
10296 SmallVector<int, 32> Unpckl, Unpckh;
10297 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10298 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10299
10300 unsigned UnpackOpcode;
10301 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10302 UnpackOpcode = X86ISD::UNPCKL;
10303 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10304 UnpackOpcode = X86ISD::UNPCKH;
10305 else
10306 return SDValue();
10307
10308 // This is a "natural" unpack operation (rather than the 128-bit sectored
10309 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10310 // input in order to use the x86 instruction.
10311 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10312 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10313 V1 = DAG.getBitcast(VT, V1);
10314 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10315}
10316
10317// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10318// source into the lower elements and zeroing the upper elements.
10319static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10320 ArrayRef<int> Mask, const APInt &Zeroable,
10321 const X86Subtarget &Subtarget) {
10322 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10323 return false;
10324
10325 unsigned NumElts = Mask.size();
10326 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10327 unsigned MaxScale = 64 / EltSizeInBits;
10328
10329 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10330 unsigned SrcEltBits = EltSizeInBits * Scale;
10331 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10332 continue;
10333 unsigned NumSrcElts = NumElts / Scale;
10334 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10335 continue;
10336 unsigned UpperElts = NumElts - NumSrcElts;
10337 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10338 continue;
10339 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10340 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10341 DstVT = MVT::getIntegerVT(EltSizeInBits);
10342 if ((NumSrcElts * EltSizeInBits) >= 128) {
10343 // ISD::TRUNCATE
10344 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10345 } else {
10346 // X86ISD::VTRUNC
10347 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10348 }
10349 return true;
10350 }
10351
10352 return false;
10353}
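// Standalone sketch of the mask shape matchShuffleAsVTRUNC looks for: the low
// NumSrcElts entries step through the source by 'Scale' and every remaining
// element must be zeroable. Plain std::vector stand-ins; -1 marks an element
// treated as undef/zero here, purely for illustration (the real code tracks
// zeroable elements separately). The helper name is hypothetical.
#include <cstdio>
#include <vector>

static bool looksLikeTruncationMask(const std::vector<int> &Mask,
                                    unsigned Scale) {
  unsigned NumElts = Mask.size();
  unsigned NumSrcElts = NumElts / Scale;
  // Low part: 0, Scale, 2*Scale, ... (undefs allowed).
  for (unsigned i = 0; i != NumSrcElts; ++i)
    if (Mask[i] != -1 && Mask[i] != int(i * Scale))
      return false;
  // Upper part: must all be undef/zero.
  for (unsigned i = NumSrcElts; i != NumElts; ++i)
    if (Mask[i] != -1)
      return false;
  return true;
}

int main() {
  // v8i16 mask <0,2,4,6,Z,Z,Z,Z> is a 2x truncation into the low half.
  std::vector<int> M = {0, 2, 4, 6, -1, -1, -1, -1};
  std::printf("%s\n", looksLikeTruncationMask(M, 2) ? "truncation" : "no");
}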
10354
10355// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10356// element padding to the final DstVT.
10357static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10358 const X86Subtarget &Subtarget,
10359 SelectionDAG &DAG, bool ZeroUppers) {
10360 MVT SrcVT = Src.getSimpleValueType();
10361 MVT DstSVT = DstVT.getScalarType();
10362 unsigned NumDstElts = DstVT.getVectorNumElements();
10363 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10364 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10365
10366 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10367 return SDValue();
10368
10369 // Perform a direct ISD::TRUNCATE if possible.
10370 if (NumSrcElts == NumDstElts)
10371 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10372
10373 if (NumSrcElts > NumDstElts) {
10374 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10375 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10376 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10377 }
10378
10379 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10380 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10381 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10382 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10383 DstVT.getSizeInBits());
10384 }
10385
10386 // Non-VLX targets must truncate from a 512-bit type, so we need to
10387 // widen, truncate and then possibly extract the original subvector.
10388 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10389 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10390 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10391 }
10392
10393 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10394 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10395 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10396 if (DstVT != TruncVT)
10397 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10398 DstVT.getSizeInBits());
10399 return Trunc;
10400}
10401
10402// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10403//
10404// An example is the following:
10405//
10406// t0: ch = EntryToken
10407// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10408// t25: v4i32 = truncate t2
10409// t41: v8i16 = bitcast t25
10410// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10411// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10412// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10413// t18: v2i64 = bitcast t51
10414//
10415 // One can just use a single vpmovdw instruction; without avx512vl we need to
10416// use the zmm variant and extract the lower subvector, padding with zeroes.
10417// TODO: Merge with lowerShuffleAsVTRUNC.
10418 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10419 SDValue V2, ArrayRef<int> Mask,
10420 const APInt &Zeroable,
10421 const X86Subtarget &Subtarget,
10422 SelectionDAG &DAG) {
10423 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10424 if (!Subtarget.hasAVX512())
10425 return SDValue();
10426
10427 unsigned NumElts = VT.getVectorNumElements();
10428 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10429 unsigned MaxScale = 64 / EltSizeInBits;
10430 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10431 unsigned SrcEltBits = EltSizeInBits * Scale;
10432 unsigned NumSrcElts = NumElts / Scale;
10433 unsigned UpperElts = NumElts - NumSrcElts;
10434 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10435 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10436 continue;
10437
10438 // Attempt to find a matching source truncation, but as a fallback, VLX
10439 // cases can use the VPMOV directly.
10440 SDValue Src = peekThroughBitcasts(V1);
10441 if (Src.getOpcode() == ISD::TRUNCATE &&
10442 Src.getScalarValueSizeInBits() == SrcEltBits) {
10443 Src = Src.getOperand(0);
10444 } else if (Subtarget.hasVLX()) {
10445 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10446 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10447 Src = DAG.getBitcast(SrcVT, Src);
10448 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10449 if (Scale == 2 &&
10450 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10451 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10452 return SDValue();
10453 } else
10454 return SDValue();
10455
10456 // VPMOVWB is only available with avx512bw.
10457 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10458 return SDValue();
10459
10460 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10461 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10462 }
10463
10464 return SDValue();
10465}
10466
10467// Attempt to match binary shuffle patterns as a truncate.
10468 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10469 SDValue V2, ArrayRef<int> Mask,
10470 const APInt &Zeroable,
10471 const X86Subtarget &Subtarget,
10472 SelectionDAG &DAG) {
10473 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10474 "Unexpected VTRUNC type");
10475 if (!Subtarget.hasAVX512())
10476 return SDValue();
10477
10478 unsigned NumElts = VT.getVectorNumElements();
10479 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10480 unsigned MaxScale = 64 / EltSizeInBits;
10481 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10482 // TODO: Support non-BWI VPMOVWB truncations?
10483 unsigned SrcEltBits = EltSizeInBits * Scale;
10484 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10485 continue;
10486
10487 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10488 // Bail if the V2 elements are undef.
10489 unsigned NumHalfSrcElts = NumElts / Scale;
10490 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10491 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10492 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10493 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10494 continue;
10495
10496 // The elements beyond the truncation must be undef/zero.
10497 unsigned UpperElts = NumElts - NumSrcElts;
10498 if (UpperElts > 0 &&
10499 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10500 continue;
10501 bool UndefUppers =
10502 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10503
10504 // For offset truncations, ensure that the concat is cheap.
10505 if (Offset) {
10506 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10507 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10508 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10509 return Lo.getOperand(0) == Hi.getOperand(0);
10510 if (ISD::isNormalLoad(Lo.getNode()) &&
10511 ISD::isNormalLoad(Hi.getNode())) {
10512 auto *LDLo = cast<LoadSDNode>(Lo);
10513 auto *LDHi = cast<LoadSDNode>(Hi);
10514 return DAG.areNonVolatileConsecutiveLoads(
10515 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10516 }
10517 return false;
10518 };
10519 if (!IsCheapConcat(peekThroughBitcasts(V1), peekThroughBitcasts(V2)))
10520 continue;
10521 }
10522
10523 // As we're using both sources, we need to concat them together
10524 // and truncate from the double-sized src.
10525 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10526 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10527
10528 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10529 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10530 Src = DAG.getBitcast(SrcVT, Src);
10531
10532 // Shift the offset'd elements into place for the truncation.
10533 // TODO: Use getTargetVShiftByConstNode.
10534 if (Offset)
10535 Src = DAG.getNode(
10536 X86ISD::VSRLI, DL, SrcVT, Src,
10537 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10538
10539 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10540 }
10541 }
10542
10543 return SDValue();
10544}
10545
10546/// Check whether a compaction lowering can be done by dropping even/odd
10547/// elements and compute how many times even/odd elements must be dropped.
10548///
10549/// This handles shuffles which take every Nth element where N is a power of
10550/// two. Example shuffle masks:
10551///
10552/// (even)
10553/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10554/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10555/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10556/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10557/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10558/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10559///
10560/// (odd)
10561 /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15
10562/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10563///
10564/// Any of these lanes can of course be undef.
10565///
10566/// This routine only supports N <= 3.
10567/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10568/// for larger N.
10569///
10570/// \returns N above, or the number of times even/odd elements must be dropped
10571/// if there is such a number. Otherwise returns zero.
10572static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10573 bool IsSingleInput) {
10574 // The modulus for the shuffle vector entries is based on whether this is
10575 // a single input or not.
10576 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10577 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10578 "We should only be called with masks with a power-of-2 size!");
10579
10580 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10581 int Offset = MatchEven ? 0 : 1;
10582
10583 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10584 // and 2^3 simultaneously. This is because we may have ambiguity with
10585 // partially undef inputs.
10586 bool ViableForN[3] = {true, true, true};
10587
10588 for (int i = 0, e = Mask.size(); i < e; ++i) {
10589 // Ignore undef lanes; we'll optimistically collapse them to the pattern we
10590 // want.
10591 if (Mask[i] < 0)
10592 continue;
10593
10594 bool IsAnyViable = false;
10595 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10596 if (ViableForN[j]) {
10597 uint64_t N = j + 1;
10598
10599 // The shuffle mask must be equal to (i * 2^N) % M.
10600 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10601 IsAnyViable = true;
10602 else
10603 ViableForN[j] = false;
10604 }
10605 // Early exit if we exhaust the possible powers of two.
10606 if (!IsAnyViable)
10607 break;
10608 }
10609
10610 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10611 if (ViableForN[j])
10612 return j + 1;
10613
10614 // Return 0 as there is no viable power of two.
10615 return 0;
10616}
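// Minimal standalone sketch of the even/odd element-dropping check described
// above: for some N in {1,2,3}, every defined mask entry must equal
// Offset + (i * 2^N) % Modulus. Plain ints, -1 for undef; illustration only,
// and droppingFactor is a hypothetical name, not the LLVM routine.
#include <cstdio>
#include <vector>

static int droppingFactor(const std::vector<int> &Mask, bool MatchEven,
                          bool IsSingleInput) {
  int Modulus = int(Mask.size()) * (IsSingleInput ? 1 : 2);
  int Offset = MatchEven ? 0 : 1;
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (int i = 0, e = int(Mask.size()); i != e; ++i)
      if (Mask[i] >= 0 && Mask[i] != Offset + ((i << N) % Modulus))
        Viable = false;
    if (Viable)
      return N;
  }
  return 0; // No viable power-of-two stride.
}

int main() {
  // Taking every 2nd element of a single 16-element input => N = 1.
  std::vector<int> M = {0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
  std::printf("N = %d\n", droppingFactor(M, /*MatchEven=*/true,
                                         /*IsSingleInput=*/true));
}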
10617
10618// X86 has dedicated pack instructions that can handle specific truncation
10619// operations: PACKSS and PACKUS.
10620// Checks for compaction shuffle masks if MaxStages > 1.
10621// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10622static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10623 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10624 const SelectionDAG &DAG,
10625 const X86Subtarget &Subtarget,
10626 unsigned MaxStages = 1) {
10627 unsigned NumElts = VT.getVectorNumElements();
10628 unsigned BitSize = VT.getScalarSizeInBits();
10629 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10630 "Illegal maximum compaction");
10631
10632 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10633 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10634 unsigned NumPackedBits = NumSrcBits - BitSize;
10635 N1 = peekThroughBitcasts(N1);
10636 N2 = peekThroughBitcasts(N2);
10637 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10638 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10639 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10640 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10641 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10642 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10643 return false;
10644 if (Subtarget.hasSSE41() || BitSize == 8) {
10645 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10646 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10647 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10648 V1 = N1;
10649 V2 = N2;
10650 SrcVT = PackVT;
10651 PackOpcode = X86ISD::PACKUS;
10652 return true;
10653 }
10654 }
10655 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10656 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10657 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10658 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10659 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10660 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10661 V1 = N1;
10662 V2 = N2;
10663 SrcVT = PackVT;
10664 PackOpcode = X86ISD::PACKSS;
10665 return true;
10666 }
10667 return false;
10668 };
10669
10670 // Attempt to match against wider and wider compaction patterns.
10671 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10672 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10673 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10674
10675 // Try binary shuffle.
10676 SmallVector<int, 32> BinaryMask;
10677 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10678 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10679 if (MatchPACK(V1, V2, PackVT))
10680 return true;
10681
10682 // Try unary shuffle.
10683 SmallVector<int, 32> UnaryMask;
10684 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10685 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10686 if (MatchPACK(V1, V1, PackVT))
10687 return true;
10688 }
10689
10690 return false;
10691}
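// Standalone sketch of the single-stage PACK compaction mask tested above:
// viewed in the destination element type, PACKSS/PACKUS write the truncated
// elements of the first source to the low half of each 128-bit lane and the
// second source to the high half. This mirrors my reading of
// createPackShuffleMask; makePackMask is a hypothetical helper, illustration
// only.
#include <cstdio>
#include <vector>

static std::vector<int> makePackMask(int NumElts, int DstEltBits, bool Unary) {
  std::vector<int> Mask;
  int NumEltsPerLane = 128 / DstEltBits;
  int NumLanes = NumElts / NumEltsPerLane;
  int Offset = Unary ? 0 : NumElts;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    for (int Elt = 0; Elt != NumEltsPerLane; Elt += 2) // from source 1
      Mask.push_back(Lane * NumEltsPerLane + Elt);
    for (int Elt = 0; Elt != NumEltsPerLane; Elt += 2) // from source 2
      Mask.push_back(Lane * NumEltsPerLane + Elt + Offset);
  }
  return Mask;
}

int main() {
  // v16i8 PACK of two v8i16 sources: <0,2,...,14,16,18,...,30>.
  for (int M : makePackMask(16, 8, /*Unary=*/false))
    std::printf("%d ", M);
  std::printf("\n");
}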
10692
10693 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10694 SDValue V2, ArrayRef<int> Mask,
10695 const X86Subtarget &Subtarget,
10696 SelectionDAG &DAG) {
10697 MVT PackVT;
10698 unsigned PackOpcode;
10699 unsigned SizeBits = VT.getSizeInBits();
10700 unsigned EltBits = VT.getScalarSizeInBits();
10701 unsigned MaxStages = Log2_32(64 / EltBits);
10702 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10703 Subtarget, MaxStages))
10704 return SDValue();
10705
10706 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10707 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10708
10709 // Don't lower multi-stage packs on AVX512, truncation is better.
10710 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10711 return SDValue();
10712
10713 // Pack to the largest type possible:
10714 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10715 unsigned MaxPackBits = 16;
10716 if (CurrentEltBits > 16 &&
10717 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10718 MaxPackBits = 32;
10719
10720 // Repeatedly pack down to the target size.
10721 SDValue Res;
10722 for (unsigned i = 0; i != NumStages; ++i) {
10723 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10724 unsigned NumSrcElts = SizeBits / SrcEltBits;
10725 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10726 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10727 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10728 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10729 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10730 DAG.getBitcast(SrcVT, V2));
10731 V1 = V2 = Res;
10732 CurrentEltBits /= 2;
10733 }
10734 assert(Res && Res.getValueType() == VT &&
10735 "Failed to lower compaction shuffle");
10736 return Res;
10737}
10738
10739/// Try to emit a bitmask instruction for a shuffle.
10740///
10741/// This handles cases where we can model a blend exactly as a bitmask due to
10742/// one of the inputs being zeroable.
10743 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10744 SDValue V2, ArrayRef<int> Mask,
10745 const APInt &Zeroable,
10746 const X86Subtarget &Subtarget,
10747 SelectionDAG &DAG) {
10748 MVT MaskVT = VT;
10749 MVT EltVT = VT.getVectorElementType();
10750 SDValue Zero, AllOnes;
10751 // Use f64 if i64 isn't legal.
10752 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10753 EltVT = MVT::f64;
10754 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10755 }
10756
10757 MVT LogicVT = VT;
10758 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10759 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10760 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
10761 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10762 LogicVT =
10763 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10764 } else {
10765 Zero = DAG.getConstant(0, DL, EltVT);
10766 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10767 }
10768
10769 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10770 SDValue V;
10771 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10772 if (Zeroable[i])
10773 continue;
10774 if (Mask[i] % Size != i)
10775 return SDValue(); // Not a blend.
10776 if (!V)
10777 V = Mask[i] < Size ? V1 : V2;
10778 else if (V != (Mask[i] < Size ? V1 : V2))
10779 return SDValue(); // Can only let one input through the mask.
10780
10781 VMaskOps[i] = AllOnes;
10782 }
10783 if (!V)
10784 return SDValue(); // No non-zeroable elements!
10785
10786 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10787 VMask = DAG.getBitcast(LogicVT, VMask);
10788 V = DAG.getBitcast(LogicVT, V);
10789 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10790 return DAG.getBitcast(VT, And);
10791}
10792
10793/// Try to emit a blend instruction for a shuffle using bit math.
10794///
10795/// This is used as a fallback approach when first class blend instructions are
10796/// unavailable. Currently it is only suitable for integer vectors, but could
10797/// be generalized for floating point vectors if desirable.
10798 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10799 SDValue V2, ArrayRef<int> Mask,
10800 SelectionDAG &DAG) {
10801 assert(VT.isInteger() && "Only supports integer vector types!");
10802 MVT EltVT = VT.getVectorElementType();
10803 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10804 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10805 SmallVector<SDValue, 16> MaskOps;
10806 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10807 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10808 return SDValue(); // Shuffled input!
10809 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10810 }
10811
10812 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10813 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10814}
10815
10816 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10817 SDValue PreservedSrc,
10818 const X86Subtarget &Subtarget,
10819 SelectionDAG &DAG);
10820
10821 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10822 MutableArrayRef<int> Mask,
10823 const APInt &Zeroable, bool &ForceV1Zero,
10824 bool &ForceV2Zero, uint64_t &BlendMask) {
10825 bool V1IsZeroOrUndef =
10826 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10827 bool V2IsZeroOrUndef =
10828 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10829
10830 BlendMask = 0;
10831 ForceV1Zero = false, ForceV2Zero = false;
10832 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10833
10834 int NumElts = Mask.size();
10835 int NumLanes = VT.getSizeInBits() / 128;
10836 int NumEltsPerLane = NumElts / NumLanes;
10837 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10838
10839 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10840 // then ensure the blend mask part for that lane just references that input.
10841 bool ForceWholeLaneMasks =
10842 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10843
10844 // Attempt to generate the binary blend mask. If an input is zero then
10845 // we can use any lane.
10846 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10847 // Keep track of the inputs used per lane.
10848 bool LaneV1InUse = false;
10849 bool LaneV2InUse = false;
10850 uint64_t LaneBlendMask = 0;
10851 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10852 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10853 int M = Mask[Elt];
10854 if (M == SM_SentinelUndef)
10855 continue;
10856 if (M == Elt || (0 <= M && M < NumElts &&
10857 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10858 Mask[Elt] = Elt;
10859 LaneV1InUse = true;
10860 continue;
10861 }
10862 if (M == (Elt + NumElts) ||
10863 (NumElts <= M &&
10864 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10865 LaneBlendMask |= 1ull << LaneElt;
10866 Mask[Elt] = Elt + NumElts;
10867 LaneV2InUse = true;
10868 continue;
10869 }
10870 if (Zeroable[Elt]) {
10871 if (V1IsZeroOrUndef) {
10872 ForceV1Zero = true;
10873 Mask[Elt] = Elt;
10874 LaneV1InUse = true;
10875 continue;
10876 }
10877 if (V2IsZeroOrUndef) {
10878 ForceV2Zero = true;
10879 LaneBlendMask |= 1ull << LaneElt;
10880 Mask[Elt] = Elt + NumElts;
10881 LaneV2InUse = true;
10882 continue;
10883 }
10884 }
10885 return false;
10886 }
10887
10888 // If we only used V2 then splat the lane blend mask to avoid any demanded
10889 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10890 // blend mask bit).
10891 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10892 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10893
10894 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10895 }
10896 return true;
10897}
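// Tiny standalone sketch of the immediate built by the matcher above: bit i of
// the blend mask selects the second source for element i (undef elements can
// go either way; here they default to the first source). The zeroable and
// per-lane handling of the real matcher is omitted. Plain ints, -1 for undef;
// matchBlendImmediate is a hypothetical name, illustration only.
#include <cstdint>
#include <cstdio>
#include <vector>

static bool matchBlendImmediate(const std::vector<int> &Mask,
                                uint64_t &BlendMask) {
  BlendMask = 0;
  int NumElts = int(Mask.size());
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0 || M == i)
      continue;               // undef, or element i from the first source
    if (M == i + NumElts) {
      BlendMask |= 1ull << i; // element i from the second source
      continue;
    }
    return false;             // any other index is not a pure blend
  }
  return true;
}

int main() {
  uint64_t Imm;
  // v4i32 mask <0,5,2,7> blends elements 1 and 3 from V2 => imm 0b1010.
  if (matchBlendImmediate({0, 5, 2, 7}, Imm))
    std::printf("0x%llx\n", (unsigned long long)Imm);
}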
10898
10899/// Try to emit a blend instruction for a shuffle.
10900///
10901/// This doesn't do any checks for the availability of instructions for blending
10902/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10903/// be matched in the backend with the type given. What it does check for is
10904/// that the shuffle mask is a blend, or convertible into a blend with zero.
10905 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10906 SDValue V2, ArrayRef<int> Original,
10907 const APInt &Zeroable,
10908 const X86Subtarget &Subtarget,
10909 SelectionDAG &DAG) {
10910 uint64_t BlendMask = 0;
10911 bool ForceV1Zero = false, ForceV2Zero = false;
10912 SmallVector<int, 64> Mask(Original);
10913 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10914 BlendMask))
10915 return SDValue();
10916
10917 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10918 if (ForceV1Zero)
10919 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10920 if (ForceV2Zero)
10921 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10922
10923 unsigned NumElts = VT.getVectorNumElements();
10924
10925 switch (VT.SimpleTy) {
10926 case MVT::v4i64:
10927 case MVT::v8i32:
10928 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10929 [[fallthrough]];
10930 case MVT::v4f64:
10931 case MVT::v8f32:
10932 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10933 [[fallthrough]];
10934 case MVT::v2f64:
10935 case MVT::v2i64:
10936 case MVT::v4f32:
10937 case MVT::v4i32:
10938 case MVT::v8i16:
10939 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10940 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10941 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10942 case MVT::v16i16: {
10943 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10944 SmallVector<int, 8> RepeatedMask;
10945 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10946 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10947 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10948 BlendMask = 0;
10949 for (int i = 0; i < 8; ++i)
10950 if (RepeatedMask[i] >= 8)
10951 BlendMask |= 1ull << i;
10952 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10953 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10954 }
10955 // Use PBLENDW for lower/upper lanes and then blend lanes.
10956 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10957 // merge to VSELECT where useful.
10958 uint64_t LoMask = BlendMask & 0xFF;
10959 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10960 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10961 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10962 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10963 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10964 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10965 return DAG.getVectorShuffle(
10966 MVT::v16i16, DL, Lo, Hi,
10967 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10968 }
10969 [[fallthrough]];
10970 }
10971 case MVT::v32i8:
10972 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10973 [[fallthrough]];
10974 case MVT::v16i8: {
10975 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10976
10977 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10978 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10979 Subtarget, DAG))
10980 return Masked;
10981
10982 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10983 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10984 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10985 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10986 }
10987
10988 // If we have VPTERNLOG, we can use that as a bit blend.
10989 if (Subtarget.hasVLX())
10990 if (SDValue BitBlend =
10991 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10992 return BitBlend;
10993
10994 // Scale the blend by the number of bytes per element.
10995 int Scale = VT.getScalarSizeInBits() / 8;
10996
10997 // This form of blend is always done on bytes. Compute the byte vector
10998 // type.
10999 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11000
11001 // x86 allows load folding with blendvb from the 2nd source operand. But
11002 // we are still using LLVM select here (see comment below), so that's V1.
11003 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11004 // allow that load-folding possibility.
11005 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11006 ShuffleVectorSDNode::commuteMask(Mask);
11007 std::swap(V1, V2);
11008 }
11009
11010 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11011 // mix of LLVM's code generator and the x86 backend. We tell the code
11012 // generator that boolean values in the elements of an x86 vector register
11013 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11014 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11015 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11016 // of the element (the remaining are ignored) and 0 in that high bit would
11017 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11018 // the LLVM model for boolean values in vector elements gets the relevant
11019 // bit set, it is set backwards and over constrained relative to x86's
11020 // actual model.
11021 SmallVector<SDValue, 32> VSELECTMask;
11022 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11023 for (int j = 0; j < Scale; ++j)
11024 VSELECTMask.push_back(
11025 Mask[i] < 0
11026 ? DAG.getUNDEF(MVT::i8)
11027 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11028
11029 V1 = DAG.getBitcast(BlendVT, V1);
11030 V2 = DAG.getBitcast(BlendVT, V2);
11031 return DAG.getBitcast(
11032 VT,
11033 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11034 V1, V2));
11035 }
11036 case MVT::v16f32:
11037 case MVT::v8f64:
11038 case MVT::v8i64:
11039 case MVT::v16i32:
11040 case MVT::v32i16:
11041 case MVT::v64i8: {
11042 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11043 bool OptForSize = DAG.shouldOptForSize();
11044 if (!OptForSize) {
11045 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11046 Subtarget, DAG))
11047 return Masked;
11048 }
11049
11050 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11051 // masked move.
11052 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11053 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11054 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11055 }
11056 default:
11057 llvm_unreachable("Not a supported integer vector type!");
11058 }
11059}
11060
11061/// Try to lower as a blend of elements from two inputs followed by
11062/// a single-input permutation.
11063///
11064/// This matches the pattern where we can blend elements from two inputs and
11065/// then reduce the shuffle to a single-input permutation.
11066 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11067 SDValue V1, SDValue V2,
11068 ArrayRef<int> Mask,
11069 SelectionDAG &DAG,
11070 bool ImmBlends = false) {
11071 // We build up the blend mask while checking whether a blend is a viable way
11072 // to reduce the shuffle.
11073 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11074 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11075
11076 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11077 if (Mask[i] < 0)
11078 continue;
11079
11080 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11081
11082 if (BlendMask[Mask[i] % Size] < 0)
11083 BlendMask[Mask[i] % Size] = Mask[i];
11084 else if (BlendMask[Mask[i] % Size] != Mask[i])
11085 return SDValue(); // Can't blend in the needed input!
11086
11087 PermuteMask[i] = Mask[i] % Size;
11088 }
11089
11090 // If only immediate blends, then bail if the blend mask can't be widened to
11091 // i16.
11092 unsigned EltSize = VT.getScalarSizeInBits();
11093 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11094 return SDValue();
11095
11096 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11097 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11098}
11099
11100/// Try to lower as an unpack of elements from two inputs followed by
11101/// a single-input permutation.
11102///
11103/// This matches the pattern where we can unpack elements from two inputs and
11104/// then reduce the shuffle to a single-input (wider) permutation.
11105 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11106 SDValue V1, SDValue V2,
11107 ArrayRef<int> Mask,
11108 SelectionDAG &DAG) {
11109 int NumElts = Mask.size();
11110 int NumLanes = VT.getSizeInBits() / 128;
11111 int NumLaneElts = NumElts / NumLanes;
11112 int NumHalfLaneElts = NumLaneElts / 2;
11113
11114 bool MatchLo = true, MatchHi = true;
11115 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11116
11117 // Determine UNPCKL/UNPCKH type and operand order.
11118 for (int Elt = 0; Elt != NumElts; ++Elt) {
11119 int M = Mask[Elt];
11120 if (M < 0)
11121 continue;
11122
11123 // Normalize the mask value depending on whether it's V1 or V2.
11124 int NormM = M;
11125 SDValue &Op = Ops[Elt & 1];
11126 if (M < NumElts && (Op.isUndef() || Op == V1))
11127 Op = V1;
11128 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11129 Op = V2;
11130 NormM -= NumElts;
11131 } else
11132 return SDValue();
11133
11134 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11135 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11136 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11137 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11138 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11139 if (MatchLoAnyLane || MatchHiAnyLane) {
11140 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11141 "Failed to match UNPCKLO/UNPCKHI");
11142 break;
11143 }
11144 }
11145 MatchLo &= MatchLoAnyLane;
11146 MatchHi &= MatchHiAnyLane;
11147 if (!MatchLo && !MatchHi)
11148 return SDValue();
11149 }
11150 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11151
11152 // Element indices have changed after unpacking. Calculate permute mask
11153 // so that they will be put back to the position as dictated by the
11154 // original shuffle mask indices.
11155 SmallVector<int, 32> PermuteMask(NumElts, -1);
11156 for (int Elt = 0; Elt != NumElts; ++Elt) {
11157 int M = Mask[Elt];
11158 if (M < 0)
11159 continue;
11160 int NormM = M;
11161 if (NumElts <= M)
11162 NormM -= NumElts;
11163 bool IsFirstOp = M < NumElts;
11164 int BaseMaskElt =
11165 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11166 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11167 PermuteMask[Elt] = BaseMaskElt;
11168 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11169 PermuteMask[Elt] = BaseMaskElt + 1;
11170 assert(PermuteMask[Elt] != -1 &&
11171 "Input mask element is defined but failed to assign permute mask");
11172 }
11173
11174 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11175 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11176 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11177}
11178
11179/// Try to lower a shuffle as a permute of the inputs followed by an
11180/// UNPCK instruction.
11181///
11182/// This specifically targets cases where we end up with alternating between
11183/// the two inputs, and so can permute them into something that feeds a single
11184/// UNPCK instruction. Note that this routine only targets integer vectors
11185/// because for floating point vectors we have a generalized SHUFPS lowering
11186/// strategy that handles everything that doesn't *exactly* match an unpack,
11187/// making this clever lowering unnecessary.
11188 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11189 SDValue V1, SDValue V2,
11190 ArrayRef<int> Mask,
11191 const X86Subtarget &Subtarget,
11192 SelectionDAG &DAG) {
11193 int Size = Mask.size();
11194 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11195
11196 // This routine only supports 128-bit integer dual input vectors.
11197 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11198 return SDValue();
11199
11200 int NumLoInputs =
11201 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11202 int NumHiInputs =
11203 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11204
11205 bool UnpackLo = NumLoInputs >= NumHiInputs;
11206
11207 auto TryUnpack = [&](int ScalarSize, int Scale) {
11208 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11209 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11210
11211 for (int i = 0; i < Size; ++i) {
11212 if (Mask[i] < 0)
11213 continue;
11214
11215 // Each element of the unpack contains Scale elements from this mask.
11216 int UnpackIdx = i / Scale;
11217
11218 // We only handle the case where V1 feeds the first slots of the unpack.
11219 // We rely on canonicalization to ensure this is the case.
11220 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11221 return SDValue();
11222
11223 // Setup the mask for this input. The indexing is tricky as we have to
11224 // handle the unpack stride.
11225 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11226 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11227 Mask[i] % Size;
11228 }
11229
11230 // If we will have to shuffle both inputs to use the unpack, check whether
11231 // we can just unpack first and shuffle the result. If so, skip this unpack.
11232 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11233 !isNoopShuffleMask(V2Mask))
11234 return SDValue();
11235
11236 // Shuffle the inputs into place.
11237 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11238 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11239
11240 // Cast the inputs to the type we will use to unpack them.
11241 MVT UnpackVT =
11242 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11243 V1 = DAG.getBitcast(UnpackVT, V1);
11244 V2 = DAG.getBitcast(UnpackVT, V2);
11245
11246 // Unpack the inputs and cast the result back to the desired type.
11247 return DAG.getBitcast(
11248 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11249 UnpackVT, V1, V2));
11250 };
11251
11252 // We try each unpack from the largest to the smallest to try and find one
11253 // that fits this mask.
11254 int OrigScalarSize = VT.getScalarSizeInBits();
11255 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11256 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11257 return Unpack;
11258
11259 // If we're shuffling with a zero vector then we're better off not doing
11260 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11261 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11262 ISD::isBuildVectorAllZeros(V2.getNode()))
11263 return SDValue();
11264
11265 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11266 // initial unpack.
11267 if (NumLoInputs == 0 || NumHiInputs == 0) {
11268 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11269 "We have to have *some* inputs!");
11270 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11271
11272 // FIXME: We could consider the total complexity of the permute of each
11273 // possible unpacking. Or at the least we should consider how many
11274 // half-crossings are created.
11275 // FIXME: We could consider commuting the unpacks.
11276
11277 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11278 for (int i = 0; i < Size; ++i) {
11279 if (Mask[i] < 0)
11280 continue;
11281
11282 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11283
11284 PermMask[i] =
11285 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11286 }
11287 return DAG.getVectorShuffle(
11288 VT, DL,
11289 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11290 V1, V2),
11291 DAG.getUNDEF(VT), PermMask);
11292 }
11293
11294 return SDValue();
11295}
11296
11297/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11298/// permuting the elements of the result in place.
11299 static SDValue lowerShuffleAsByteRotateAndPermute(
11300 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11301 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11302 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11303 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11304 (VT.is512BitVector() && !Subtarget.hasBWI()))
11305 return SDValue();
11306
11307 // We don't currently support lane crossing permutes.
11308 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11309 return SDValue();
11310
11311 int Scale = VT.getScalarSizeInBits() / 8;
11312 int NumLanes = VT.getSizeInBits() / 128;
11313 int NumElts = VT.getVectorNumElements();
11314 int NumEltsPerLane = NumElts / NumLanes;
11315
11316 // Determine range of mask elts.
11317 bool Blend1 = true;
11318 bool Blend2 = true;
11319 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11320 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11321 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11322 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11323 int M = Mask[Lane + Elt];
11324 if (M < 0)
11325 continue;
11326 if (M < NumElts) {
11327 Blend1 &= (M == (Lane + Elt));
11328 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11329 M = M % NumEltsPerLane;
11330 Range1.first = std::min(Range1.first, M);
11331 Range1.second = std::max(Range1.second, M);
11332 } else {
11333 M -= NumElts;
11334 Blend2 &= (M == (Lane + Elt));
11335 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11336 M = M % NumEltsPerLane;
11337 Range2.first = std::min(Range2.first, M);
11338 Range2.second = std::max(Range2.second, M);
11339 }
11340 }
11341 }
11342
11343 // Bail if we don't need both elements.
11344 // TODO - it might be worth doing this for unary shuffles if the permute
11345 // can be widened.
11346 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11347 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11348 return SDValue();
11349
11350 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11351 return SDValue();
11352
11353 // Rotate the 2 ops so we can access both ranges, then permute the result.
11354 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11355 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11356 SDValue Rotate = DAG.getBitcast(
11357 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11358 DAG.getBitcast(ByteVT, Lo),
11359 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11360 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11361 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11362 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11363 int M = Mask[Lane + Elt];
11364 if (M < 0)
11365 continue;
11366 if (M < NumElts)
11367 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11368 else
11369 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11370 }
11371 }
11372 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11373 };
11374
11375 // Check if the ranges are small enough to rotate from either direction.
11376 if (Range2.second < Range1.first)
11377 return RotateAndPermute(V1, V2, Range1.first, 0);
11378 if (Range1.second < Range2.first)
11379 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11380 return SDValue();
11381}
11382
11383 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11384 return isUndefOrEqual(Mask, 0);
11385}
11386
11387 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11388 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11389}
11390
11391/// Check if the Mask consists of the same element repeated multiple times.
11392 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11393 size_t NumUndefs = 0;
11394 std::optional<int> UniqueElt;
11395 for (int Elt : Mask) {
11396 if (Elt == SM_SentinelUndef) {
11397 NumUndefs++;
11398 continue;
11399 }
11400 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11401 return false;
11402 UniqueElt = Elt;
11403 }
11404 // Make sure the element is repeated enough times by checking the number of
11405 // undefs is small.
11406 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11407}
11408
11409/// Generic routine to decompose a shuffle and blend into independent
11410/// blends and permutes.
11411///
11412/// This matches the extremely common pattern for handling combined
11413/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11414/// operations. It will try to pick the best arrangement of shuffles and
11415/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11416 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11417 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11418 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11419 int NumElts = Mask.size();
11420 int NumLanes = VT.getSizeInBits() / 128;
11421 int NumEltsPerLane = NumElts / NumLanes;
11422
11423 // Shuffle the input elements into the desired positions in V1 and V2 and
11424 // unpack/blend them together.
11425 bool IsAlternating = true;
11426 bool V1Zero = true, V2Zero = true;
11427 SmallVector<int, 32> V1Mask(NumElts, -1);
11428 SmallVector<int, 32> V2Mask(NumElts, -1);
11429 SmallVector<int, 32> FinalMask(NumElts, -1);
11430 for (int i = 0; i < NumElts; ++i) {
11431 int M = Mask[i];
11432 if (M >= 0 && M < NumElts) {
11433 V1Mask[i] = M;
11434 FinalMask[i] = i;
11435 V1Zero &= Zeroable[i];
11436 IsAlternating &= (i & 1) == 0;
11437 } else if (M >= NumElts) {
11438 V2Mask[i] = M - NumElts;
11439 FinalMask[i] = i + NumElts;
11440 V2Zero &= Zeroable[i];
11441 IsAlternating &= (i & 1) == 1;
11442 }
11443 }
11444
11445 // If we effectively only demand the 0'th element of \p Input (though not
11446 // necessarily only in the 0'th position), then broadcast said input
11447 // and change \p InputMask to be a no-op (identity) mask.
11448 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11449 &DAG](SDValue &Input,
11450 MutableArrayRef<int> InputMask) {
11451 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11452 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11453 !X86::mayFoldLoad(Input, Subtarget)))
11454 return;
11455 if (isNoopShuffleMask(InputMask))
11456 return;
11457 assert(isBroadcastShuffleMask(InputMask) &&
11458 "Expected to demand only the 0'th element.");
11459 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11460 for (auto I : enumerate(InputMask)) {
11461 int &InputMaskElt = I.value();
11462 if (InputMaskElt >= 0)
11463 InputMaskElt = I.index();
11464 }
11465 };
11466
11467 // Currently, we may need to produce one shuffle per input, and blend results.
11468 // It is possible that the shuffle for one of the inputs is already a no-op.
11469 // See if we can simplify non-no-op shuffles into broadcasts,
11470 // which we consider to be strictly better than an arbitrary shuffle.
11471 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11472 isNoopOrBroadcastShuffleMask(V2Mask)) {
11473 canonicalizeBroadcastableInput(V1, V1Mask);
11474 canonicalizeBroadcastableInput(V2, V2Mask);
11475 }
11476
11477 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11478 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11479 // the shuffle may be able to fold with a load or other benefit. However, when
11480 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11481 // pre-shuffle first is a better strategy.
11482 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11483 // Only prefer immediate blends to unpack/rotate.
11484 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11485 DAG, true))
11486 return BlendPerm;
11487 // If either input vector provides only a single element which is repeated
11488 // multiple times, unpacking from both input vectors would generate worse
11489 // code. e.g. for
11490 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11491 // it is better to process t4 first to create a vector of t4[0], then unpack
11492 // that vector with t2.
11493 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11494 !isSingleElementRepeatedMask(V2Mask))
11495 if (SDValue UnpackPerm =
11496 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11497 return UnpackPerm;
11498 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11499 DL, VT, V1, V2, Mask, Subtarget, DAG))
11500 return RotatePerm;
11501 // Unpack/rotate failed - try again with variable blends.
11502 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11503 DAG))
11504 return BlendPerm;
11505 if (VT.getScalarSizeInBits() >= 32)
11506 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11507 DL, VT, V1, V2, Mask, Subtarget, DAG))
11508 return PermUnpack;
11509 }
11510
11511 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11512 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11513 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11514 // than half the elements coming from each source.
11515 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11516 V1Mask.assign(NumElts, -1);
11517 V2Mask.assign(NumElts, -1);
11518 FinalMask.assign(NumElts, -1);
11519 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11520 for (int j = 0; j != NumEltsPerLane; ++j) {
11521 int M = Mask[i + j];
11522 if (M >= 0 && M < NumElts) {
11523 V1Mask[i + (j / 2)] = M;
11524 FinalMask[i + j] = i + (j / 2);
11525 } else if (M >= NumElts) {
11526 V2Mask[i + (j / 2)] = M - NumElts;
11527 FinalMask[i + j] = i + (j / 2) + NumElts;
11528 }
11529 }
11530 }
11531
11532 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11533 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11534 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11535}
11536
11537static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11538 const X86Subtarget &Subtarget,
11539 ArrayRef<int> Mask) {
11540 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11541 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11542
11543 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11544 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11545 int MaxSubElts = 64 / EltSizeInBits;
11546 unsigned RotateAmt, NumSubElts;
11547 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11548 MaxSubElts, NumSubElts, RotateAmt))
11549 return -1;
11550 unsigned NumElts = Mask.size();
11551 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11552 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11553 return RotateAmt;
11554}
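// Standalone sketch of the mask shape accepted by the bit-rotate matching
// above: the elements form groups of NumSubElts, and within every group the
// mask is the same rotation of that group's own elements. Which VPROL/VPROR
// direction that maps to is left to the real lowering; this only checks the
// grouping. Plain ints, -1 for undef; groupRotationAmount is a hypothetical
// name, illustration only.
#include <cstdio>
#include <vector>

static int groupRotationAmount(const std::vector<int> &Mask, int NumSubElts) {
  int NumElts = int(Mask.size());
  int Rot = -1;
  for (int Base = 0; Base != NumElts; Base += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[Base + j];
      if (M < 0)
        continue;
      if (M < Base || M >= Base + NumSubElts)
        return -1;                    // crosses a group boundary
      int ThisRot = (M - Base - j + NumSubElts) % NumSubElts;
      if (Rot >= 0 && ThisRot != Rot)
        return -1;                    // inconsistent rotation
      Rot = ThisRot;
    }
  return Rot; // rotation in sub-elements, or -1 if nothing matched
}

int main() {
  // v16i8 mask <1,0,3,2,...> rotates each i16-sized pair by one byte.
  std::vector<int> M = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
  std::printf("rotate by %d sub-elements\n", groupRotationAmount(M, 2));
}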
11555
11556/// Lower shuffle using X86ISD::VROTLI rotations.
11557 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11558 ArrayRef<int> Mask,
11559 const X86Subtarget &Subtarget,
11560 SelectionDAG &DAG) {
11561 // Only XOP + AVX512 targets have bit rotation instructions.
11562 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11563 bool IsLegal =
11564 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11565 if (!IsLegal && Subtarget.hasSSE3())
11566 return SDValue();
11567
11568 MVT RotateVT;
11569 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11570 Subtarget, Mask);
11571 if (RotateAmt < 0)
11572 return SDValue();
11573
11574 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11575 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11576 // widen to vXi16 or more, then the existing lowering should be better.
11577 if (!IsLegal) {
11578 if ((RotateAmt % 16) == 0)
11579 return SDValue();
11580 // TODO: Use getTargetVShiftByConstNode.
11581 unsigned ShlAmt = RotateAmt;
11582 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11583 V1 = DAG.getBitcast(RotateVT, V1);
11584 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11585 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11586 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11587 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11588 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11589 return DAG.getBitcast(VT, Rot);
11590 }
11591
11592 SDValue Rot =
11593 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11594 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11595 return DAG.getBitcast(VT, Rot);
11596}
11597
11598/// Try to match a vector shuffle as an element rotation.
11599///
11600 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11601 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11602 ArrayRef<int> Mask) {
11603 int NumElts = Mask.size();
11604
11605 // We need to detect various ways of spelling a rotation:
11606 // [11, 12, 13, 14, 15, 0, 1, 2]
11607 // [-1, 12, 13, 14, -1, -1, 1, -1]
11608 // [-1, -1, -1, -1, -1, -1, 1, 2]
11609 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11610 // [-1, 4, 5, 6, -1, -1, 9, -1]
11611 // [-1, 4, 5, 6, -1, -1, -1, -1]
11612 int Rotation = 0;
11613 SDValue Lo, Hi;
11614 for (int i = 0; i < NumElts; ++i) {
11615 int M = Mask[i];
11616 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11617 "Unexpected mask index.");
11618 if (M < 0)
11619 continue;
11620
11621 // Determine where a rotated vector would have started.
11622 int StartIdx = i - (M % NumElts);
11623 if (StartIdx == 0)
11624 // The identity rotation isn't interesting, stop.
11625 return -1;
11626
11627 // If we found the tail of a vector the rotation must be the missing
11628 // front. If we found the head of a vector, it must be how much of the
11629 // head.
11630 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11631
11632 if (Rotation == 0)
11633 Rotation = CandidateRotation;
11634 else if (Rotation != CandidateRotation)
11635 // The rotations don't match, so we can't match this mask.
11636 return -1;
11637
11638 // Compute which value this mask is pointing at.
11639 SDValue MaskV = M < NumElts ? V1 : V2;
11640
11641 // Compute which of the two target values this index should be assigned
11642 // to. This reflects whether the high elements are remaining or the low
11643 // elements are remaining.
11644 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11645
11646 // Either set up this value if we've not encountered it before, or check
11647 // that it remains consistent.
11648 if (!TargetV)
11649 TargetV = MaskV;
11650 else if (TargetV != MaskV)
11651 // This may be a rotation, but it pulls from the inputs in some
11652 // unsupported interleaving.
11653 return -1;
11654 }
11655
11656 // Check that we successfully analyzed the mask, and normalize the results.
11657 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11658 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11659 if (!Lo)
11660 Lo = Hi;
11661 else if (!Hi)
11662 Hi = Lo;
11663
11664 V1 = Lo;
11665 V2 = Hi;
11666
11667 return Rotation;
11668}
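// Standalone sketch of the rotation detection above, using plain ints instead
// of SDValues: indices < NumElts read the low input, indices >= NumElts read
// the high input, and every defined entry must agree on a single rotation of
// the concatenated pair (the source-operand consistency checks are omitted).
// detectElementRotate is a hypothetical name, illustration only.
#include <cstdio>
#include <vector>

static int detectElementRotate(const std::vector<int> &Mask) {
  int NumElts = int(Mask.size());
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // identity, not an interesting rotation
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // entries disagree on the rotation amount
  }
  return Rotation;
}

int main() {
  // v8i16-style mask <11,12,13,14,15,0,1,2> is a rotation by 3 elements.
  std::vector<int> M = {11, 12, 13, 14, 15, 0, 1, 2};
  std::printf("rotation = %d\n", detectElementRotate(M));
}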
11669
11670/// Try to lower a vector shuffle as a byte rotation.
11671///
11672/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11673/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11674/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11675 /// try to generically lower a vector shuffle through such a pattern. It
11676/// does not check for the profitability of lowering either as PALIGNR or
11677/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11678/// This matches shuffle vectors that look like:
11679///
11680/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11681///
11682/// Essentially it concatenates V1 and V2, shifts right by some number of
11683/// elements, and takes the low elements as the result. Note that while this is
11684/// specified as a *right shift* because x86 is little-endian, it is a *left
11685/// rotate* of the vector lanes.
11686 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11687 ArrayRef<int> Mask) {
11688 // Don't accept any shuffles with zero elements.
11689 if (isAnyZero(Mask))
11690 return -1;
11691
11692 // PALIGNR works on 128-bit lanes.
11693 SmallVector<int, 16> RepeatedMask;
11694 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11695 return -1;
11696
11697 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11698 if (Rotation <= 0)
11699 return -1;
11700
11701 // PALIGNR rotates bytes, so we need to scale the
11702 // rotation based on how many bytes are in the vector lane.
11703 int NumElts = RepeatedMask.size();
11704 int Scale = 16 / NumElts;
11705 return Rotation * Scale;
11706}
11707
11708 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11709 SDValue V2, ArrayRef<int> Mask,
11710 const X86Subtarget &Subtarget,
11711 SelectionDAG &DAG) {
11712 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11713
11714 SDValue Lo = V1, Hi = V2;
11715 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11716 if (ByteRotation <= 0)
11717 return SDValue();
11718
11719 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11720 // PSLLDQ/PSRLDQ.
11721 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11722 Lo = DAG.getBitcast(ByteVT, Lo);
11723 Hi = DAG.getBitcast(ByteVT, Hi);
11724
11725 // SSSE3 targets can use the palignr instruction.
11726 if (Subtarget.hasSSSE3()) {
11727 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11728 "512-bit PALIGNR requires BWI instructions");
11729 return DAG.getBitcast(
11730 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11731 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11732 }
11733
11734 assert(VT.is128BitVector() &&
11735 "Rotate-based lowering only supports 128-bit lowering!");
11736 assert(Mask.size() <= 16 &&
11737 "Can shuffle at most 16 bytes in a 128-bit vector!");
11738 assert(ByteVT == MVT::v16i8 &&
11739 "SSE2 rotate lowering only needed for v16i8!");
11740
11741 // Default SSE2 implementation
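  // Illustrative sketch: for ByteRotation == 6 this computes
  // (Lo << 10) | (Hi >> 6), i.e. bytes 6..15 of Hi land in the low 10 bytes
  // and bytes 0..5 of Lo land in the upper 6 bytes of the result.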
11742 int LoByteShift = 16 - ByteRotation;
11743 int HiByteShift = ByteRotation;
11744
11745 SDValue LoShift =
11746 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11747 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11748 SDValue HiShift =
11749 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11750 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11751 return DAG.getBitcast(VT,
11752 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11753}
11754
11755/// Try to lower a vector shuffle as a dword/qword rotation.
11756///
11757/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11758/// rotation of the concatenation of two vectors; this routine will
11759/// try to generically lower a vector shuffle through such a pattern.
11760///
11761/// Essentially it concatenates V1 and V2, shifts right by some number of
11762/// elements, and takes the low elements as the result. Note that while this is
11763/// specified as a *right shift* because x86 is little-endian, it is a *left
11764/// rotate* of the vector lanes.
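/// For example, a v8i32 mask [3, 4, 5, 6, 7, 8, 9, 10] is an element rotation
/// of 3 and lowers to a single VALIGND with immediate 3.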
11765static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11766                                     SDValue V2, ArrayRef<int> Mask,
11767 const APInt &Zeroable,
11768 const X86Subtarget &Subtarget,
11769 SelectionDAG &DAG) {
11770 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11771 "Only 32-bit and 64-bit elements are supported!");
11772
11773 // 128/256-bit vectors are only supported with VLX.
11774 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11775 && "VLX required for 128/256-bit vectors");
11776
11777 SDValue Lo = V1, Hi = V2;
11778 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11779 if (0 < Rotation)
11780 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11781 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11782
11783 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11784 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11785 // TODO: We can probably make this more aggressive and use shift-pairs like
11786 // lowerShuffleAsByteShiftMask.
11787 unsigned NumElts = Mask.size();
11788 unsigned ZeroLo = Zeroable.countr_one();
11789 unsigned ZeroHi = Zeroable.countl_one();
11790 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11791 if (!ZeroLo && !ZeroHi)
11792 return SDValue();
11793
11794 if (ZeroLo) {
11795 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11796 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11797 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11798 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11799 getZeroVector(VT, Subtarget, DAG, DL),
11800 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11801 }
11802
11803 if (ZeroHi) {
11804 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11805 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11806 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11807 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11808 getZeroVector(VT, Subtarget, DAG, DL), Src,
11809 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11810 }
11811
11812 return SDValue();
11813}
11814
11815/// Try to lower a vector shuffle as a byte shift sequence.
11816static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11817                                            SDValue V2, ArrayRef<int> Mask,
11818 const APInt &Zeroable,
11819 const X86Subtarget &Subtarget,
11820 SelectionDAG &DAG) {
11821 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11822 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11823
11824 // We need a shuffle that has zeros at one/both ends and a sequential
11825 // shuffle from one source within.
11826 unsigned ZeroLo = Zeroable.countr_one();
11827 unsigned ZeroHi = Zeroable.countl_one();
11828 if (!ZeroLo && !ZeroHi)
11829 return SDValue();
11830
11831 unsigned NumElts = Mask.size();
11832 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11833 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11834 return SDValue();
11835
11836 unsigned Scale = VT.getScalarSizeInBits() / 8;
11837 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11838 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11839 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11840 return SDValue();
11841
11842 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11843 Res = DAG.getBitcast(MVT::v16i8, Res);
11844
11845 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11846 // inner sequential set of elements, possibly offset:
11847 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11848 // 01234567 --> 4567zzzz --> zzzzz456
11849 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11850 if (ZeroLo == 0) {
11851 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11852 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11853 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11854 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11855 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11856 } else if (ZeroHi == 0) {
11857 unsigned Shift = Mask[ZeroLo] % NumElts;
11858 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11859 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11860 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11861 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11862 } else if (!Subtarget.hasSSSE3()) {
11863     // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11864 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11865 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11866 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11867 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11868 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11869 Shift += Mask[ZeroLo] % NumElts;
11870 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11871 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11872 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11873 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11874 } else
11875 return SDValue();
11876
11877 return DAG.getBitcast(VT, Res);
11878}
11879
11880/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11881///
11882/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11883/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11884/// matches elements from one of the input vectors shuffled to the left or
11885/// right with zeroable elements 'shifted in'. It handles both the strictly
11886/// bit-wise element shifts and the byte shift across an entire 128-bit double
11887/// quad word lane.
11888///
11889/// PSLL : (little-endian) left bit shift.
11890/// [ zz, 0, zz, 2 ]
11891/// [ -1, 4, zz, -1 ]
11892/// PSRL : (little-endian) right bit shift.
11893/// [ 1, zz, 3, zz]
11894/// [ -1, -1, 7, zz]
11895/// PSLLDQ : (little-endian) left byte shift
11896/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11897/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11898/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11899/// PSRLDQ : (little-endian) right byte shift
11900/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11901/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11902/// [ 1, 2, -1, -1, -1, -1, zz, zz]
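/// Masks can also match after widening the elements, e.g. the v8i16 mask
/// [ 1, zz, 3, zz, 5, zz, 7, zz] matches with Scale == 2 as a VSRLI by
/// 16 bits on v4i32.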
11903static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11904 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11905 int MaskOffset, const APInt &Zeroable,
11906 const X86Subtarget &Subtarget) {
11907 int Size = Mask.size();
11908 unsigned SizeInBits = Size * ScalarSizeInBits;
11909
11910 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11911 for (int i = 0; i < Size; i += Scale)
11912 for (int j = 0; j < Shift; ++j)
11913 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11914 return false;
11915
11916 return true;
11917 };
11918
11919 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11920 for (int i = 0; i != Size; i += Scale) {
11921 unsigned Pos = Left ? i + Shift : i;
11922 unsigned Low = Left ? i : i + Shift;
11923 unsigned Len = Scale - Shift;
11924 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11925 return -1;
11926 }
11927
11928 int ShiftEltBits = ScalarSizeInBits * Scale;
11929 bool ByteShift = ShiftEltBits > 64;
11930 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11931 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11932 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11933
11934 // Normalize the scale for byte shifts to still produce an i64 element
11935 // type.
11936 Scale = ByteShift ? Scale / 2 : Scale;
11937
11938 // We need to round trip through the appropriate type for the shift.
11939 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11940 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11941 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11942 return (int)ShiftAmt;
11943 };
11944
11945 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11946 // keep doubling the size of the integer elements up to that. We can
11947 // then shift the elements of the integer vector by whole multiples of
11948 // their width within the elements of the larger integer vector. Test each
11949 // multiple to see if we can find a match with the moved element indices
11950 // and that the shifted in elements are all zeroable.
11951 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11952 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11953 for (int Shift = 1; Shift != Scale; ++Shift)
11954 for (bool Left : {true, false})
11955 if (CheckZeros(Shift, Scale, Left)) {
11956 int ShiftAmt = MatchShift(Shift, Scale, Left);
11957 if (0 < ShiftAmt)
11958 return ShiftAmt;
11959 }
11960
11961 // no match
11962 return -1;
11963}
11964
11965static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11966                                    SDValue V2, ArrayRef<int> Mask,
11967 const APInt &Zeroable,
11968 const X86Subtarget &Subtarget,
11969 SelectionDAG &DAG, bool BitwiseOnly) {
11970 int Size = Mask.size();
11971 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11972
11973 MVT ShiftVT;
11974 SDValue V = V1;
11975 unsigned Opcode;
11976
11977 // Try to match shuffle against V1 shift.
11978 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11979 Mask, 0, Zeroable, Subtarget);
11980
11981 // If V1 failed, try to match shuffle against V2 shift.
11982 if (ShiftAmt < 0) {
11983 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11984 Mask, Size, Zeroable, Subtarget);
11985 V = V2;
11986 }
11987
11988 if (ShiftAmt < 0)
11989 return SDValue();
11990
11991 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11992 return SDValue();
11993
11994 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11995 "Illegal integer vector type");
11996 V = DAG.getBitcast(ShiftVT, V);
11997 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11998 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11999 return DAG.getBitcast(VT, V);
12000}
12001
12002// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12003// Remainder of lower half result is zero and upper half is all undef.
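// For example, a v16i8 mask [2, 3, 4, zz, zz, zz, zz, zz, <8 x undef>]
// matches with BitIdx == 16 and BitLen == 24 (extract three bytes of V1
// starting at byte 2).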
12004static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12005 ArrayRef<int> Mask, uint64_t &BitLen,
12006 uint64_t &BitIdx, const APInt &Zeroable) {
12007 int Size = Mask.size();
12008 int HalfSize = Size / 2;
12009 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12010 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12011
12012 // Upper half must be undefined.
12013 if (!isUndefUpperHalf(Mask))
12014 return false;
12015
12016 // Determine the extraction length from the part of the
12017 // lower half that isn't zeroable.
12018 int Len = HalfSize;
12019 for (; Len > 0; --Len)
12020 if (!Zeroable[Len - 1])
12021 break;
12022 assert(Len > 0 && "Zeroable shuffle mask");
12023
12024 // Attempt to match first Len sequential elements from the lower half.
12025 SDValue Src;
12026 int Idx = -1;
12027 for (int i = 0; i != Len; ++i) {
12028 int M = Mask[i];
12029 if (M == SM_SentinelUndef)
12030 continue;
12031 SDValue &V = (M < Size ? V1 : V2);
12032 M = M % Size;
12033
12034 // The extracted elements must start at a valid index and all mask
12035 // elements must be in the lower half.
12036 if (i > M || M >= HalfSize)
12037 return false;
12038
12039 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12040 Src = V;
12041 Idx = M - i;
12042 continue;
12043 }
12044 return false;
12045 }
12046
12047 if (!Src || Idx < 0)
12048 return false;
12049
12050 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12051 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12052 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12053 V1 = Src;
12054 return true;
12055}
12056
12057// INSERTQ: Extract lowest Len elements from lower half of second source and
12058// insert over first source, starting at Idx.
12059// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
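// For example, a v16i8 mask [0, 1, 16, 17, 18, 5, 6, 7, <8 x undef>] matches
// with BitIdx == 16 and BitLen == 24 (insert three bytes of V2 at byte 2 of
// V1).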
12060static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12061 ArrayRef<int> Mask, uint64_t &BitLen,
12062 uint64_t &BitIdx) {
12063 int Size = Mask.size();
12064 int HalfSize = Size / 2;
12065 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12066
12067 // Upper half must be undefined.
12068 if (!isUndefUpperHalf(Mask))
12069 return false;
12070
12071 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12072 SDValue Base;
12073
12074 // Attempt to match first source from mask before insertion point.
12075 if (isUndefInRange(Mask, 0, Idx)) {
12076 /* EMPTY */
12077 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12078 Base = V1;
12079 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12080 Base = V2;
12081 } else {
12082 continue;
12083 }
12084
12085 // Extend the extraction length looking to match both the insertion of
12086 // the second source and the remaining elements of the first.
12087 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12088 SDValue Insert;
12089 int Len = Hi - Idx;
12090
12091 // Match insertion.
12092 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12093 Insert = V1;
12094 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12095 Insert = V2;
12096 } else {
12097 continue;
12098 }
12099
12100 // Match the remaining elements of the lower half.
12101 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12102 /* EMPTY */
12103 } else if ((!Base || (Base == V1)) &&
12104 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12105 Base = V1;
12106 } else if ((!Base || (Base == V2)) &&
12107 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12108 Size + Hi)) {
12109 Base = V2;
12110 } else {
12111 continue;
12112 }
12113
12114 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12115 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12116 V1 = Base;
12117 V2 = Insert;
12118 return true;
12119 }
12120 }
12121
12122 return false;
12123}
12124
12125/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12126static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12127                                      SDValue V2, ArrayRef<int> Mask,
12128 const APInt &Zeroable, SelectionDAG &DAG) {
12129 uint64_t BitLen, BitIdx;
12130 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12131 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12132 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12133 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12134
12135 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12136 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12137 V2 ? V2 : DAG.getUNDEF(VT),
12138 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12139 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12140
12141 return SDValue();
12142}
12143
12144/// Lower a vector shuffle as a zero or any extension.
12145///
12146/// Given a specific number of elements, element bit width, and extension
12147/// stride, produce either a zero or any extension based on the available
12148/// features of the subtarget. The extended elements are consecutive and
12149/// can start from an offset element index in the input; to avoid excess
12150/// shuffling, the offset must either be in the bottom lane or at the start
12151/// of a higher lane. All extended elements must be from
12152/// the same lane.
12153static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
12154     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
12155 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12156 assert(Scale > 1 && "Need a scale to extend.");
12157 int EltBits = VT.getScalarSizeInBits();
12158 int NumElements = VT.getVectorNumElements();
12159 int NumEltsPerLane = 128 / EltBits;
12160 int OffsetLane = Offset / NumEltsPerLane;
12161 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12162 "Only 8, 16, and 32 bit elements can be extended.");
12163 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12164 assert(0 <= Offset && "Extension offset must be positive.");
12165 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12166 "Extension offset must be in the first lane or start an upper lane.");
12167
12168 // Check that an index is in same lane as the base offset.
12169 auto SafeOffset = [&](int Idx) {
12170 return OffsetLane == (Idx / NumEltsPerLane);
12171 };
12172
12173 // Shift along an input so that the offset base moves to the first element.
12174 auto ShuffleOffset = [&](SDValue V) {
12175 if (!Offset)
12176 return V;
12177
12178 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12179 for (int i = 0; i * Scale < NumElements; ++i) {
12180 int SrcIdx = i + Offset;
12181 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12182 }
12183 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12184 };
12185
12186 // Found a valid a/zext mask! Try various lowering strategies based on the
12187 // input type and available ISA extensions.
12188 if (Subtarget.hasSSE41()) {
12189 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12190 // PUNPCK will catch this in a later shuffle match.
12191 if (Offset && Scale == 2 && VT.is128BitVector())
12192 return SDValue();
12193 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12194 NumElements / Scale);
12195 InputV = DAG.getBitcast(VT, InputV);
12196 InputV = ShuffleOffset(InputV);
12197     InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
12198                                     DL, ExtVT, InputV, DAG);
12199 return DAG.getBitcast(VT, InputV);
12200 }
12201
12202 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12203 InputV = DAG.getBitcast(VT, InputV);
12204
12205 // For any extends we can cheat for larger element sizes and use shuffle
12206 // instructions that can fold with a load and/or copy.
12207 if (AnyExt && EltBits == 32) {
12208 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12209 -1};
12210 return DAG.getBitcast(
12211 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12212 DAG.getBitcast(MVT::v4i32, InputV),
12213 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12214 }
12215 if (AnyExt && EltBits == 16 && Scale > 2) {
12216 int PSHUFDMask[4] = {Offset / 2, -1,
12217 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12218 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12219 DAG.getBitcast(MVT::v4i32, InputV),
12220 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12221 int PSHUFWMask[4] = {1, -1, -1, -1};
12222 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12223 return DAG.getBitcast(
12224 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12225 DAG.getBitcast(MVT::v8i16, InputV),
12226 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12227 }
12228
12229 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12230 // to 64-bits.
12231 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12232 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12233 assert(VT.is128BitVector() && "Unexpected vector width!");
12234
12235 int LoIdx = Offset * EltBits;
12236 SDValue Lo = DAG.getBitcast(
12237 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12238 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12239 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12240
12241 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12242 return DAG.getBitcast(VT, Lo);
12243
12244 int HiIdx = (Offset + 1) * EltBits;
12245 SDValue Hi = DAG.getBitcast(
12246 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12247 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12248 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12249 return DAG.getBitcast(VT,
12250 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12251 }
12252
12253 // If this would require more than 2 unpack instructions to expand, use
12254 // pshufb when available. We can only use more than 2 unpack instructions
12255 // when zero extending i8 elements which also makes it easier to use pshufb.
12256 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12257 assert(NumElements == 16 && "Unexpected byte vector width!");
12258 SDValue PSHUFBMask[16];
12259 for (int i = 0; i < 16; ++i) {
12260 int Idx = Offset + (i / Scale);
12261 if ((i % Scale == 0 && SafeOffset(Idx))) {
12262 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12263 continue;
12264 }
12265 PSHUFBMask[i] =
12266 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12267 }
12268 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12269 return DAG.getBitcast(
12270 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12271 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12272 }
12273
12274 // If we are extending from an offset, ensure we start on a boundary that
12275 // we can unpack from.
12276 int AlignToUnpack = Offset % (NumElements / Scale);
12277 if (AlignToUnpack) {
12278 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12279 for (int i = AlignToUnpack; i < NumElements; ++i)
12280 ShMask[i - AlignToUnpack] = i;
12281 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12282 Offset -= AlignToUnpack;
12283 }
12284
12285 // Otherwise emit a sequence of unpacks.
12286 do {
12287 unsigned UnpackLoHi = X86ISD::UNPCKL;
12288 if (Offset >= (NumElements / 2)) {
12289 UnpackLoHi = X86ISD::UNPCKH;
12290 Offset -= (NumElements / 2);
12291 }
12292
12293 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12294 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12295 : getZeroVector(InputVT, Subtarget, DAG, DL);
12296 InputV = DAG.getBitcast(InputVT, InputV);
12297 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12298 Scale /= 2;
12299 EltBits *= 2;
12300 NumElements /= 2;
12301 } while (Scale > 1);
12302 return DAG.getBitcast(VT, InputV);
12303}
12304
12305/// Try to lower a vector shuffle as a zero extension on any microarch.
12306///
12307/// This routine will try to do everything in its power to cleverly lower
12308/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12309/// check for the profitability of this lowering, it tries to aggressively
12310/// match this pattern. It will use all of the micro-architectural details it
12311/// can to emit an efficient lowering. It handles both blends with all-zero
12312/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12313/// masking out later).
12314///
12315/// The reason we have dedicated lowering for zext-style shuffles is that they
12316/// are both incredibly common and often quite performance sensitive.
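/// For example, a v8i16 mask [0, zz, 1, zz, 2, zz, 3, zz] is matched with
/// Scale == 2 and, with SSE4.1, lowers to a zero extension of the low four
/// i16 elements to v4i32 (PMOVZXWD).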
12317static SDValue lowerShuffleAsZeroOrAnyExtend(
12318     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12319 const APInt &Zeroable, const X86Subtarget &Subtarget,
12320 SelectionDAG &DAG) {
12321 int Bits = VT.getSizeInBits();
12322 int NumLanes = Bits / 128;
12323 int NumElements = VT.getVectorNumElements();
12324 int NumEltsPerLane = NumElements / NumLanes;
12325 assert(VT.getScalarSizeInBits() <= 32 &&
12326 "Exceeds 32-bit integer zero extension limit");
12327 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12328
12329 // Define a helper function to check a particular ext-scale and lower to it if
12330 // valid.
12331 auto Lower = [&](int Scale) -> SDValue {
12332 SDValue InputV;
12333 bool AnyExt = true;
12334 int Offset = 0;
12335 int Matches = 0;
12336 for (int i = 0; i < NumElements; ++i) {
12337 int M = Mask[i];
12338 if (M < 0)
12339 continue; // Valid anywhere but doesn't tell us anything.
12340 if (i % Scale != 0) {
12341 // Each of the extended elements need to be zeroable.
12342 if (!Zeroable[i])
12343 return SDValue();
12344
12345 // We no longer are in the anyext case.
12346 AnyExt = false;
12347 continue;
12348 }
12349
12350 // Each of the base elements needs to be consecutive indices into the
12351 // same input vector.
12352 SDValue V = M < NumElements ? V1 : V2;
12353 M = M % NumElements;
12354 if (!InputV) {
12355 InputV = V;
12356 Offset = M - (i / Scale);
12357 } else if (InputV != V)
12358 return SDValue(); // Flip-flopping inputs.
12359
12360 // Offset must start in the lowest 128-bit lane or at the start of an
12361 // upper lane.
12362 // FIXME: Is it ever worth allowing a negative base offset?
12363 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12364 (Offset % NumEltsPerLane) == 0))
12365 return SDValue();
12366
12367 // If we are offsetting, all referenced entries must come from the same
12368 // lane.
12369 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12370 return SDValue();
12371
12372 if ((M % NumElements) != (Offset + (i / Scale)))
12373 return SDValue(); // Non-consecutive strided elements.
12374 Matches++;
12375 }
12376
12377 // If we fail to find an input, we have a zero-shuffle which should always
12378 // have already been handled.
12379 // FIXME: Maybe handle this here in case during blending we end up with one?
12380 if (!InputV)
12381 return SDValue();
12382
12383 // If we are offsetting, don't extend if we only match a single input, we
12384 // can always do better by using a basic PSHUF or PUNPCK.
12385 if (Offset != 0 && Matches < 2)
12386 return SDValue();
12387
12388 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12389 InputV, Mask, Subtarget, DAG);
12390 };
12391
12392 // The widest scale possible for extending is to a 64-bit integer.
12393 assert(Bits % 64 == 0 &&
12394 "The number of bits in a vector must be divisible by 64 on x86!");
12395 int NumExtElements = Bits / 64;
12396
12397 // Each iteration, try extending the elements half as much, but into twice as
12398 // many elements.
12399 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12400 assert(NumElements % NumExtElements == 0 &&
12401 "The input vector size must be divisible by the extended size.");
12402 if (SDValue V = Lower(NumElements / NumExtElements))
12403 return V;
12404 }
12405
12406 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12407 if (Bits != 128)
12408 return SDValue();
12409
12410 // Returns one of the source operands if the shuffle can be reduced to a
12411 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12412 auto CanZExtLowHalf = [&]() {
12413 for (int i = NumElements / 2; i != NumElements; ++i)
12414 if (!Zeroable[i])
12415 return SDValue();
12416 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12417 return V1;
12418 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12419 return V2;
12420 return SDValue();
12421 };
12422
12423 if (SDValue V = CanZExtLowHalf()) {
12424 V = DAG.getBitcast(MVT::v2i64, V);
12425 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12426 return DAG.getBitcast(VT, V);
12427 }
12428
12429 // No viable ext lowering found.
12430 return SDValue();
12431}
12432
12433/// Try to get a scalar value for a specific element of a vector.
12434///
12435/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12436static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12437                                               SelectionDAG &DAG) {
12438 MVT VT = V.getSimpleValueType();
12439 MVT EltVT = VT.getVectorElementType();
12440 V = peekThroughBitcasts(V);
12441
12442 // If the bitcasts shift the element size, we can't extract an equivalent
12443 // element from it.
12444 MVT NewVT = V.getSimpleValueType();
12445 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12446 return SDValue();
12447
12448 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12449 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12450 // Ensure the scalar operand is the same size as the destination.
12451 // FIXME: Add support for scalar truncation where possible.
12452 SDValue S = V.getOperand(Idx);
12453 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12454 return DAG.getBitcast(EltVT, S);
12455 }
12456
12457 return SDValue();
12458}
12459
12460/// Helper to test for a load that can be folded with x86 shuffles.
12461///
12462/// This is particularly important because the set of instructions varies
12463/// significantly based on whether the operand is a load or not.
12464static bool isShuffleFoldableLoad(SDValue V) {
12465   return V->hasOneUse() &&
12466          ISD::isNON_EXTLoad(V.getNode());
12467}
12468
12469template<typename T>
12470static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12471 T EltVT = VT.getScalarType();
12472 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12473 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12474}
12475
12476/// Try to lower insertion of a single element into a zero vector.
12477///
12478/// This is a common pattern for which we have especially efficient lowering
12479/// patterns across all subtarget feature sets.
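/// For example, a v4i32 mask [4, zz, zz, zz] inserts element 0 of V2 into
/// lane 0 with the remaining lanes zeroed, which typically lowers to a single
/// X86ISD::VZEXT_MOVL of V2.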
12480static SDValue lowerShuffleAsElementInsertion(
12481     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12482 const APInt &Zeroable, const X86Subtarget &Subtarget,
12483 SelectionDAG &DAG) {
12484 MVT ExtVT = VT;
12485 MVT EltVT = VT.getVectorElementType();
12486 unsigned NumElts = VT.getVectorNumElements();
12487 unsigned EltBits = VT.getScalarSizeInBits();
12488
12489 if (isSoftF16(EltVT, Subtarget))
12490 return SDValue();
12491
12492 int V2Index =
12493 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12494 Mask.begin();
12495 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12496 bool IsV1Zeroable = true;
12497 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12498 if (i != V2Index && !Zeroable[i]) {
12499 IsV1Zeroable = false;
12500 break;
12501 }
12502
12503 // Bail if a non-zero V1 isn't used in place.
12504 if (!IsV1Zeroable) {
12505 SmallVector<int, 8> V1Mask(Mask);
12506 V1Mask[V2Index] = -1;
12507 if (!isNoopShuffleMask(V1Mask))
12508 return SDValue();
12509 }
12510
12511 // Check for a single input from a SCALAR_TO_VECTOR node.
12512 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12513 // all the smarts here sunk into that routine. However, the current
12514 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12515 // vector shuffle lowering is dead.
12516 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12517 DAG);
12518 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12519 // We need to zext the scalar if it is smaller than an i32.
12520 V2S = DAG.getBitcast(EltVT, V2S);
12521 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12522 // Using zext to expand a narrow element won't work for non-zero
12523 // insertions. But we can use a masked constant vector if we're
12524 // inserting V2 into the bottom of V1.
12525 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12526 return SDValue();
12527
12528 // Zero-extend directly to i32.
12529 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12530 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12531
12532 // If we're inserting into a constant, mask off the inserted index
12533 // and OR with the zero-extended scalar.
12534 if (!IsV1Zeroable) {
12535 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12536 Bits[V2Index] = APInt::getZero(EltBits);
12537 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12538 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12539 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12540 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12541 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12542 }
12543 }
12544 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12545 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12546 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12547 // Either not inserting from the low element of the input or the input
12548 // element size is too small to use VZEXT_MOVL to clear the high bits.
12549 return SDValue();
12550 }
12551
12552 if (!IsV1Zeroable) {
12553 // If V1 can't be treated as a zero vector we have fewer options to lower
12554 // this. We can't support integer vectors or non-zero targets cheaply.
12555 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12556 if (!VT.isFloatingPoint() || V2Index != 0)
12557 return SDValue();
12558 if (!VT.is128BitVector())
12559 return SDValue();
12560
12561 // Otherwise, use MOVSD, MOVSS or MOVSH.
12562 unsigned MovOpc = 0;
12563 if (EltVT == MVT::f16)
12564 MovOpc = X86ISD::MOVSH;
12565 else if (EltVT == MVT::f32)
12566 MovOpc = X86ISD::MOVSS;
12567 else if (EltVT == MVT::f64)
12568 MovOpc = X86ISD::MOVSD;
12569 else
12570 llvm_unreachable("Unsupported floating point element type to handle!");
12571 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12572 }
12573
12574 // This lowering only works for the low element with floating point vectors.
12575 if (VT.isFloatingPoint() && V2Index != 0)
12576 return SDValue();
12577
12578 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12579 if (ExtVT != VT)
12580 V2 = DAG.getBitcast(VT, V2);
12581
12582 if (V2Index != 0) {
12583 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12584 // the desired position. Otherwise it is more efficient to do a vector
12585 // shift left. We know that we can do a vector shift left because all
12586 // the inputs are zero.
12587 if (VT.isFloatingPoint() || NumElts <= 4) {
12588 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12589 V2Shuffle[V2Index] = 0;
12590 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12591 } else {
12592 V2 = DAG.getBitcast(MVT::v16i8, V2);
12593 V2 = DAG.getNode(
12594 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12595 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12596 V2 = DAG.getBitcast(VT, V2);
12597 }
12598 }
12599 return V2;
12600}
12601
12602/// Try to lower broadcast of a single - truncated - integer element,
12603/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12604///
12605/// This assumes we have AVX2.
12606static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12607                                             int BroadcastIdx,
12608 const X86Subtarget &Subtarget,
12609 SelectionDAG &DAG) {
12610 assert(Subtarget.hasAVX2() &&
12611 "We can only lower integer broadcasts with AVX2!");
12612
12613 MVT EltVT = VT.getVectorElementType();
12614 MVT V0VT = V0.getSimpleValueType();
12615
12616 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12617 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12618
12619 MVT V0EltVT = V0VT.getVectorElementType();
12620 if (!V0EltVT.isInteger())
12621 return SDValue();
12622
12623 const unsigned EltSize = EltVT.getSizeInBits();
12624 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12625
12626 // This is only a truncation if the original element type is larger.
12627 if (V0EltSize <= EltSize)
12628 return SDValue();
12629
12630 assert(((V0EltSize % EltSize) == 0) &&
12631 "Scalar type sizes must all be powers of 2 on x86!");
12632
12633 const unsigned V0Opc = V0.getOpcode();
12634 const unsigned Scale = V0EltSize / EltSize;
12635 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12636
12637 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12638 V0Opc != ISD::BUILD_VECTOR)
12639 return SDValue();
12640
12641 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12642
12643 // If we're extracting non-least-significant bits, shift so we can truncate.
12644 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12645 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12646 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12647 if (const int OffsetIdx = BroadcastIdx % Scale)
12648 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12649 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12650
12651 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12652 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12653}
12654
12655/// Test whether this can be lowered with a single SHUFPS instruction.
12656///
12657/// This is used to disable more specialized lowerings when the shufps lowering
12658/// will happen to be efficient.
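/// For example, [0, 1, 6, 7] needs only a single SHUFPS (the low half reads
/// V1, the high half reads V2), while [0, 5, 2, 7] does not because each half
/// mixes both inputs.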
12659static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12660   // This routine only handles 128-bit shufps.
12661 assert(Mask.size() == 4 && "Unsupported mask size!");
12662 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12663 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12664 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12665 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12666
12667 // To lower with a single SHUFPS we need to have the low half and high half
12668 // each requiring a single input.
12669 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12670 return false;
12671 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12672 return false;
12673
12674 return true;
12675}
12676
12677/// Test whether the specified input (0 or 1) is in-place blended by the
12678/// given mask.
12679///
12680/// This returns true if the elements from a particular input are already in the
12681/// slot required by the given mask and require no permutation.
12682static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12683 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12684 int Size = Mask.size();
12685 for (int i = 0; i < Size; ++i)
12686 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12687 return false;
12688
12689 return true;
12690}
12691
12692/// If we are extracting two 128-bit halves of a vector and shuffling the
12693/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12694/// multi-shuffle lowering.
12695static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12696                                              SDValue N1, ArrayRef<int> Mask,
12697 SelectionDAG &DAG) {
12698 MVT VT = N0.getSimpleValueType();
12699 assert((VT.is128BitVector() &&
12700 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12701 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12702
12703 // Check that both sources are extracts of the same source vector.
12704 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12705       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12706       N0.getOperand(0) != N1.getOperand(0) ||
12707 !N0.hasOneUse() || !N1.hasOneUse())
12708 return SDValue();
12709
12710 SDValue WideVec = N0.getOperand(0);
12711 MVT WideVT = WideVec.getSimpleValueType();
12712 if (!WideVT.is256BitVector())
12713 return SDValue();
12714
12715 // Match extracts of each half of the wide source vector. Commute the shuffle
12716 // if the extract of the low half is N1.
12717 unsigned NumElts = VT.getVectorNumElements();
12718 SmallVector<int, 4> NewMask(Mask);
12719 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12720 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12721 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12722     ShuffleVectorSDNode::commuteMask(NewMask);
12723   else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12724 return SDValue();
12725
12726 // Final bailout: if the mask is simple, we are better off using an extract
12727 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12728 // because that avoids a constant load from memory.
12729 if (NumElts == 4 &&
12730 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12731 return SDValue();
12732
12733 // Extend the shuffle mask with undef elements.
12734 NewMask.append(NumElts, -1);
12735
12736 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12737 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12738 NewMask);
12739 // This is free: ymm -> xmm.
12740 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12741 DAG.getVectorIdxConstant(0, DL));
12742}
12743
12744/// Try to lower broadcast of a single element.
12745///
12746/// For convenience, this code also bundles all of the subtarget feature set
12747/// filtering. While a little annoying to re-dispatch on type here, there isn't
12748/// a convenient way to factor it out.
12749static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12750                                        SDValue V2, ArrayRef<int> Mask,
12751 const X86Subtarget &Subtarget,
12752 SelectionDAG &DAG) {
12753 MVT EltVT = VT.getVectorElementType();
12754 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12755 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12756 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12757 return SDValue();
12758
12759 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12760 // we can only broadcast from a register with AVX2.
12761 unsigned NumEltBits = VT.getScalarSizeInBits();
12762 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12763                         ? X86ISD::MOVDDUP
12764                         : X86ISD::VBROADCAST;
12765   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12766
12767 // Check that the mask is a broadcast.
12768 int BroadcastIdx = getSplatIndex(Mask);
12769 if (BroadcastIdx < 0)
12770 return SDValue();
12771 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12772 "a sorted mask where the broadcast "
12773 "comes from V1.");
12774 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
12775
12776 // Go up the chain of (vector) values to find a scalar load that we can
12777 // combine with the broadcast.
12778 // TODO: Combine this logic with findEltLoadSrc() used by
12779 // EltsFromConsecutiveLoads().
12780 int BitOffset = BroadcastIdx * NumEltBits;
12781 SDValue V = V1;
12782 for (;;) {
12783 switch (V.getOpcode()) {
12784 case ISD::BITCAST: {
12785 V = V.getOperand(0);
12786 continue;
12787 }
12788 case ISD::CONCAT_VECTORS: {
12789 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12790 int OpIdx = BitOffset / OpBitWidth;
12791 V = V.getOperand(OpIdx);
12792 BitOffset %= OpBitWidth;
12793 continue;
12794 }
12795     case ISD::EXTRACT_SUBVECTOR: {
12796       // The extraction index adds to the existing offset.
12797 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12798 unsigned Idx = V.getConstantOperandVal(1);
12799 unsigned BeginOffset = Idx * EltBitWidth;
12800 BitOffset += BeginOffset;
12801 V = V.getOperand(0);
12802 continue;
12803 }
12804 case ISD::INSERT_SUBVECTOR: {
12805 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12806 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12807 int Idx = (int)V.getConstantOperandVal(2);
12808 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12809 int BeginOffset = Idx * EltBitWidth;
12810 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12811 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12812 BitOffset -= BeginOffset;
12813 V = VInner;
12814 } else {
12815 V = VOuter;
12816 }
12817 continue;
12818 }
12819 }
12820 break;
12821 }
12822 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12823 BroadcastIdx = BitOffset / NumEltBits;
12824
12825 // Do we need to bitcast the source to retrieve the original broadcast index?
12826 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12827
12828 // Check if this is a broadcast of a scalar. We special case lowering
12829 // for scalars so that we can more effectively fold with loads.
12830 // If the original value has a larger element type than the shuffle, the
12831 // broadcast element is in essence truncated. Make that explicit to ease
12832 // folding.
12833 if (BitCastSrc && VT.isInteger())
12834 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12835 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12836 return TruncBroadcast;
12837
12838 // Also check the simpler case, where we can directly reuse the scalar.
12839 if (!BitCastSrc &&
12840 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12841 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12842 V = V.getOperand(BroadcastIdx);
12843
12844 // If we can't broadcast from a register, check that the input is a load.
12845 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12846 return SDValue();
12847 } else if (ISD::isNormalLoad(V.getNode()) &&
12848 cast<LoadSDNode>(V)->isSimple()) {
12849 // We do not check for one-use of the vector load because a broadcast load
12850 // is expected to be a win for code size, register pressure, and possibly
12851 // uops even if the original vector load is not eliminated.
12852
12853 // Reduce the vector load and shuffle to a broadcasted scalar load.
12854 LoadSDNode *Ld = cast<LoadSDNode>(V);
12855 SDValue BaseAddr = Ld->getOperand(1);
12856 MVT SVT = VT.getScalarType();
12857 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12858 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12859 SDValue NewAddr =
12860         DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12861
12862 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12863 // than MOVDDUP.
12864 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12865 if (Opcode == X86ISD::VBROADCAST) {
12866 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12867 SDValue Ops[] = {Ld->getChain(), NewAddr};
12868 V = DAG.getMemIntrinsicNode(
12869 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12870           DAG.getMachineFunction().getMachineMemOperand(
12871               Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12872       DAG.makeEquivalentMemoryOrdering(Ld, V);
12873       return DAG.getBitcast(VT, V);
12874 }
12875 assert(SVT == MVT::f64 && "Unexpected VT!");
12876 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12877                     DAG.getMachineFunction().getMachineMemOperand(
12878                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12879     DAG.makeEquivalentMemoryOrdering(Ld, V);
12880   } else if (!BroadcastFromReg) {
12881 // We can't broadcast from a vector register.
12882 return SDValue();
12883 } else if (BitOffset != 0) {
12884 // We can only broadcast from the zero-element of a vector register,
12885 // but it can be advantageous to broadcast from the zero-element of a
12886 // subvector.
12887 if (!VT.is256BitVector() && !VT.is512BitVector())
12888 return SDValue();
12889
12890 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12891 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12892 return SDValue();
12893
12894 // If we are broadcasting an element from the lowest 128-bit subvector, try
12895 // to move the element in position.
12896 if (BitOffset < 128 && NumActiveElts > 1 &&
12897 V.getScalarValueSizeInBits() == NumEltBits) {
12898 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12899 "Unexpected bit-offset");
12900 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
12901 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
12902 V = extractSubVector(V, 0, DAG, DL, 128);
12903 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
12904 } else {
12905 // Only broadcast the zero-element of a 128-bit subvector.
12906 if ((BitOffset % 128) != 0)
12907 return SDValue();
12908
12909 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12910 "Unexpected bit-offset");
12911 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12912 "Unexpected vector size");
12913 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12914 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12915 }
12916 }
12917
12918 // On AVX we can use VBROADCAST directly for scalar sources.
12919 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12920 V = DAG.getBitcast(MVT::f64, V);
12921 if (Subtarget.hasAVX()) {
12922 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12923 return DAG.getBitcast(VT, V);
12924 }
12925 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12926 }
12927
12928 // If this is a scalar, do the broadcast on this type and bitcast.
12929 if (!V.getValueType().isVector()) {
12930 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12931 "Unexpected scalar size");
12932 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12933                                        VT.getVectorNumElements());
12934   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12935 }
12936
12937 // We only support broadcasting from 128-bit vectors to minimize the
12938 // number of patterns we need to deal with in isel. So extract down to
12939 // 128-bits, removing as many bitcasts as possible.
12940 if (V.getValueSizeInBits() > 128)
12941     V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12942
12943 // Otherwise cast V to a vector with the same element type as VT, but
12944 // possibly narrower than VT. Then perform the broadcast.
12945 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12946 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12947 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12948}
12949
12950// Check for whether we can use INSERTPS to perform the shuffle. We only use
12951// INSERTPS when the V1 elements are already in the correct locations
12952// because otherwise we can just always use two SHUFPS instructions which
12953// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12954// perform INSERTPS if a single V1 element is out of place and all V2
12955// elements are zeroable.
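// For example, with no zeroable elements the mask [0, 1, 2, 4] matches with
// element 0 of V2 inserted into lane 3, giving an INSERTPS immediate of 0x30.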
12956static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12957                                    unsigned &InsertPSMask,
12958 const APInt &Zeroable,
12959 ArrayRef<int> Mask, SelectionDAG &DAG) {
12960 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12961 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12962 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12963
12964 // Attempt to match INSERTPS with one element from VA or VB being
12965 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12966 // are updated.
12967 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12968 ArrayRef<int> CandidateMask) {
12969 unsigned ZMask = 0;
12970 int VADstIndex = -1;
12971 int VBDstIndex = -1;
12972 bool VAUsedInPlace = false;
12973
12974 for (int i = 0; i < 4; ++i) {
12975 // Synthesize a zero mask from the zeroable elements (includes undefs).
12976 if (Zeroable[i]) {
12977 ZMask |= 1 << i;
12978 continue;
12979 }
12980
12981 // Flag if we use any VA inputs in place.
12982 if (i == CandidateMask[i]) {
12983 VAUsedInPlace = true;
12984 continue;
12985 }
12986
12987 // We can only insert a single non-zeroable element.
12988 if (VADstIndex >= 0 || VBDstIndex >= 0)
12989 return false;
12990
12991 if (CandidateMask[i] < 4) {
12992 // VA input out of place for insertion.
12993 VADstIndex = i;
12994 } else {
12995 // VB input for insertion.
12996 VBDstIndex = i;
12997 }
12998 }
12999
13000 // Don't bother if we have no (non-zeroable) element for insertion.
13001 if (VADstIndex < 0 && VBDstIndex < 0)
13002 return false;
13003
13004 // Determine element insertion src/dst indices. The src index is from the
13005 // start of the inserted vector, not the start of the concatenated vector.
13006 unsigned VBSrcIndex = 0;
13007 if (VADstIndex >= 0) {
13008 // If we have a VA input out of place, we use VA as the V2 element
13009 // insertion and don't use the original V2 at all.
13010 VBSrcIndex = CandidateMask[VADstIndex];
13011 VBDstIndex = VADstIndex;
13012 VB = VA;
13013 } else {
13014 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13015 }
13016
13017 // If no V1 inputs are used in place, then the result is created only from
13018 // the zero mask and the V2 insertion - so remove V1 dependency.
13019 if (!VAUsedInPlace)
13020 VA = DAG.getUNDEF(MVT::v4f32);
13021
13022 // Update V1, V2 and InsertPSMask accordingly.
13023 V1 = VA;
13024 V2 = VB;
13025
13026 // Insert the V2 element into the desired position.
13027 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13028 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13029 return true;
13030 };
13031
13032 if (matchAsInsertPS(V1, V2, Mask))
13033 return true;
13034
13035 // Commute and try again.
13036 SmallVector<int, 4> CommutedMask(Mask);
13037   ShuffleVectorSDNode::commuteMask(CommutedMask);
13038   if (matchAsInsertPS(V2, V1, CommutedMask))
13039 return true;
13040
13041 return false;
13042}
13043
13044static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13045                                       ArrayRef<int> Mask, const APInt &Zeroable,
13046 SelectionDAG &DAG) {
13047 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13048 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13049
13050 // Attempt to match the insertps pattern.
13051 unsigned InsertPSMask = 0;
13052 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13053 return SDValue();
13054
13055 // Insert the V2 element into the desired position.
13056 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13057 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13058}
13059
13060/// Handle lowering of 2-lane 64-bit floating point shuffles.
13061///
13062/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13063/// support for floating point shuffles but not integer shuffles. These
13064/// instructions will incur a domain crossing penalty on some chips though so
13065/// it is better to avoid lowering through this for integer vectors where
13066/// possible.
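/// For example, the single-input mask [1, 1] becomes a VPERMILPD (or SHUFPD)
/// with immediate 3, duplicating the high double.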
13067static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13068                                 const APInt &Zeroable, SDValue V1, SDValue V2,
13069 const X86Subtarget &Subtarget,
13070 SelectionDAG &DAG) {
13071 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13072 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13073 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13074
13075 if (V2.isUndef()) {
13076 // Check for being able to broadcast a single element.
13077 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13078 Mask, Subtarget, DAG))
13079 return Broadcast;
13080
13081 // Straight shuffle of a single input vector. Simulate this by using the
13082     // single input as both of the "inputs" to this instruction.
13083 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13084
13085 if (Subtarget.hasAVX()) {
13086 // If we have AVX, we can use VPERMILPS which will allow folding a load
13087 // into the shuffle.
13088 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13089 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13090 }
13091
13092 return DAG.getNode(
13093 X86ISD::SHUFP, DL, MVT::v2f64,
13094 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13095 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13096 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13097 }
13098 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13099 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13100 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13101 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13102
13103 if (Subtarget.hasAVX2())
13104 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13105 return Extract;
13106
13107 // When loading a scalar and then shuffling it into a vector we can often do
13108 // the insertion cheaply.
13109   if (SDValue Insertion = lowerShuffleAsElementInsertion(
13110           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13111 return Insertion;
13112 // Try inverting the insertion since for v2 masks it is easy to do and we
13113 // can't reliably sort the mask one way or the other.
13114 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13115 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13116   if (SDValue Insertion = lowerShuffleAsElementInsertion(
13117           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13118 return Insertion;
13119
13120 // Try to use one of the special instruction patterns to handle two common
13121 // blend patterns if a zero-blend above didn't work.
13122 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13123 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13124 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13125 // We can either use a special instruction to load over the low double or
13126 // to move just the low double.
13127 return DAG.getNode(
13128 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13129 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13130
13131 if (Subtarget.hasSSE41())
13132 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13133 Zeroable, Subtarget, DAG))
13134 return Blend;
13135
13136 // Use dedicated unpack instructions for masks that match their pattern.
13137 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13138 return V;
13139
13140 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13141 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13142 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13143}
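// Illustrative aside: a standalone sketch of the two-input SHUFPD immediate
// computed above, written without the DAG types. Bit 0 picks the element of
// V1 placed in lane 0 and bit 1 picks the element of V2 placed in lane 1
// (mask values 2..3 index into V2). The helper name is hypothetical.
#include <cassert>

static unsigned makeShufPDImm(int Mask0, int Mask1) {
  assert(Mask0 >= 0 && Mask0 < 2 && "Lane 0 must come from V1");
  assert(Mask1 >= 2 && Mask1 < 4 && "Lane 1 must come from V2");
  return unsigned(Mask0 == 1) | (unsigned(Mask1 - 2 == 1) << 1);
}

// Example: the mask {1, 3} (high element of V1, high element of V2) yields
// makeShufPDImm(1, 3) == 3.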
13144
13145/// Handle lowering of 2-lane 64-bit integer shuffles.
13146///
13147/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13148/// the integer unit to minimize domain crossing penalties. However, for blends
13149/// it falls back to the floating point shuffle operation with appropriate bit
13150/// casting.
13151static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13152 const APInt &Zeroable, SDValue V1, SDValue V2,
13153 const X86Subtarget &Subtarget,
13154 SelectionDAG &DAG) {
13155 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13156 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13157 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13158
13159 if (V2.isUndef()) {
13160 // Check for being able to broadcast a single element.
13161 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13162 Mask, Subtarget, DAG))
13163 return Broadcast;
13164
13165 // Straight shuffle of a single input vector. For everything from SSE2
13166 // onward this has a single fast instruction with no scary immediates.
13167 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13168 V1 = DAG.getBitcast(MVT::v4i32, V1);
13169 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13170 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13171 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13172 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13173 return DAG.getBitcast(
13174 MVT::v2i64,
13175 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13176 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13177 }
13178 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13179 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13180 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13181 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13182
13183 if (Subtarget.hasAVX2())
13184 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13185 return Extract;
13186
13187 // Try to use shift instructions.
13188 if (SDValue Shift =
13189 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13190 DAG, /*BitwiseOnly*/ false))
13191 return Shift;
13192
13193 // When loading a scalar and then shuffling it into a vector we can often do
13194 // the insertion cheaply.
13195 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13196 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13197 return Insertion;
13198 // Try inverting the insertion since for v2 masks it is easy to do and we
13199 // can't reliably sort the mask one way or the other.
13200 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13201 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13202 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13203 return Insertion;
13204
13205 // We have different paths for blend lowering, but they all must use the
13206 // *exact* same predicate.
13207 bool IsBlendSupported = Subtarget.hasSSE41();
13208 if (IsBlendSupported)
13209 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13210 Zeroable, Subtarget, DAG))
13211 return Blend;
13212
13213 // Use dedicated unpack instructions for masks that match their pattern.
13214 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13215 return V;
13216
13217 // Try to use byte rotation instructions.
13218 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13219 if (Subtarget.hasSSSE3()) {
13220 if (Subtarget.hasVLX())
13221 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13222 Zeroable, Subtarget, DAG))
13223 return Rotate;
13224
13225 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13226 Subtarget, DAG))
13227 return Rotate;
13228 }
13229
13230 // If we have direct support for blends, we should lower by decomposing into
13231 // a permute. That will be faster than the domain cross.
13232 if (IsBlendSupported)
13233 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13234 Zeroable, Subtarget, DAG);
13235
13236 // We implement this with SHUFPD which is pretty lame because it will likely
13237 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13238 // However, all the alternatives are still more cycles and newer chips don't
13239 // have this problem. It would be really nice if x86 had better shuffles here.
13240 V1 = DAG.getBitcast(MVT::v2f64, V1);
13241 V2 = DAG.getBitcast(MVT::v2f64, V2);
13242 return DAG.getBitcast(MVT::v2i64,
13243 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13244}
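// Illustrative aside: a standalone sketch of the mask widening used in the
// unary v2i64 path above, where each 64-bit lane index expands to the pair of
// 32-bit lane indices it covers (undef, -1, is propagated) so that a single
// PSHUFD can perform the shuffle. The helper name is hypothetical.
#include <array>

static std::array<int, 4> widenV2MaskToV4(int M0, int M1) {
  return {M0 < 0 ? -1 : M0 * 2, M0 < 0 ? -1 : M0 * 2 + 1,
          M1 < 0 ? -1 : M1 * 2, M1 < 0 ? -1 : M1 * 2 + 1};
}

// Example: the v2i64 swap mask {1, 0} widens to the v4i32 mask {2, 3, 0, 1}.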
13245
13246/// Lower a vector shuffle using the SHUFPS instruction.
13247///
13248/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13249/// It makes no assumptions about whether this is the *best* lowering, it simply
13250/// uses it.
13251static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13252 ArrayRef<int> Mask, SDValue V1,
13253 SDValue V2, SelectionDAG &DAG) {
13254 SDValue LowV = V1, HighV = V2;
13255 SmallVector<int, 4> NewMask(Mask);
13256 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13257
13258 if (NumV2Elements == 1) {
13259 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13260
13261 // Compute the index adjacent to V2Index and in the same half by toggling
13262 // the low bit.
13263 int V2AdjIndex = V2Index ^ 1;
13264
13265 if (Mask[V2AdjIndex] < 0) {
13266 // Handles all the cases where we have a single V2 element and an undef.
13267 // This will only ever happen in the high lanes because we commute the
13268 // vector otherwise.
13269 if (V2Index < 2)
13270 std::swap(LowV, HighV);
13271 NewMask[V2Index] -= 4;
13272 } else {
13273 // Handle the case where the V2 element ends up adjacent to a V1 element.
13274 // To make this work, blend them together as the first step.
13275 int V1Index = V2AdjIndex;
13276 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13277 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13278 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13279
13280 // Now proceed to reconstruct the final blend as we have the necessary
13281 // high or low half formed.
13282 if (V2Index < 2) {
13283 LowV = V2;
13284 HighV = V1;
13285 } else {
13286 HighV = V2;
13287 }
13288 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13289 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13290 }
13291 } else if (NumV2Elements == 2) {
13292 if (Mask[0] < 4 && Mask[1] < 4) {
13293 // Handle the easy case where we have V1 in the low lanes and V2 in the
13294 // high lanes.
13295 NewMask[2] -= 4;
13296 NewMask[3] -= 4;
13297 } else if (Mask[2] < 4 && Mask[3] < 4) {
13298 // We also handle the reversed case because this utility may get called
13299 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13300 // arrange things in the right direction.
13301 NewMask[0] -= 4;
13302 NewMask[1] -= 4;
13303 HighV = V1;
13304 LowV = V2;
13305 } else {
13306 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13307 // trying to place elements directly, just blend them and set up the final
13308 // shuffle to place them.
13309
13310 // The first two blend mask elements are for V1, the second two are for
13311 // V2.
13312 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13313 Mask[2] < 4 ? Mask[2] : Mask[3],
13314 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13315 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13316 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13317 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13318
13319 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13320 // a blend.
13321 LowV = HighV = V1;
13322 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13323 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13324 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13325 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13326 }
13327 } else if (NumV2Elements == 3) {
13328 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13329 // we can get here due to other paths (e.g. repeated mask matching) where we
13330 // don't want to do another round of lowerVECTOR_SHUFFLE.
13331 ShuffleVectorSDNode::commuteMask(NewMask);
13332 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13333 }
13334 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13335 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13336}
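// Illustrative aside: a standalone sketch of the 2-bits-per-lane immediate
// that getV4X86ShuffleImm8ForMask produces for SHUFPS/PSHUFD-style shuffles.
// Lane i of the result is described by bits [2*i+1 : 2*i]; undef lanes are
// mapped to their identity index in this sketch. The helper name is
// hypothetical.
static unsigned makeV4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(Mask[i] < 0 ? i : (Mask[i] & 3)) << (2 * i);
  return Imm;
}

// Example: the mask {2, 0, 1, 3} encodes as 0xD2 (binary 11 01 00 10).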
13337
13338/// Lower 4-lane 32-bit floating point shuffles.
13339///
13340/// Uses instructions exclusively from the floating point unit to minimize
13341/// domain crossing penalties, as these are sufficient to implement all v4f32
13342/// shuffles.
13343static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13344 const APInt &Zeroable, SDValue V1, SDValue V2,
13345 const X86Subtarget &Subtarget,
13346 SelectionDAG &DAG) {
13347 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13348 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13349 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13350
13351 if (Subtarget.hasSSE41())
13352 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13353 Zeroable, Subtarget, DAG))
13354 return Blend;
13355
13356 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13357
13358 if (NumV2Elements == 0) {
13359 // Check for being able to broadcast a single element.
13360 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13361 Mask, Subtarget, DAG))
13362 return Broadcast;
13363
13364 // Use even/odd duplicate instructions for masks that match their pattern.
13365 if (Subtarget.hasSSE3()) {
13366 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13367 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13368 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13369 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13370 }
13371
13372 if (Subtarget.hasAVX()) {
13373 // If we have AVX, we can use VPERMILPS which will allow folding a load
13374 // into the shuffle.
13375 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13376 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13377 }
13378
13379 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13380 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13381 if (!Subtarget.hasSSE2()) {
13382 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13383 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13384 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13385 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13386 }
13387
13388 // Otherwise, use a straight shuffle of a single input vector. We pass the
13389 // input vector to both operands to simulate this with a SHUFPS.
13390 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13391 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13392 }
13393
13394 if (Subtarget.hasSSE2())
13395 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13396 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13397 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13398 return ZExt;
13399 }
13400
13401 if (Subtarget.hasAVX2())
13402 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13403 return Extract;
13404
13405 // There are special ways we can lower some single-element blends. However, we
13406 // have custom ways we can lower more complex single-element blends below that
13407 // we defer to if both this and BLENDPS fail to match, so restrict this to
13408 // when the V2 input is targeting element 0 of the mask -- that is the fast
13409 // case here.
13410 if (NumV2Elements == 1 && Mask[0] >= 4)
13411 if (SDValue V = lowerShuffleAsElementInsertion(
13412 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13413 return V;
13414
13415 if (Subtarget.hasSSE41()) {
13416 // Use INSERTPS if we can complete the shuffle efficiently.
13417 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13418 return V;
13419
13420 if (!isSingleSHUFPSMask(Mask))
13421 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13422 V2, Mask, DAG))
13423 return BlendPerm;
13424 }
13425
13426 // Use low/high mov instructions. These are only valid in SSE1 because
13427 // otherwise they are widened to v2f64 and never get here.
13428 if (!Subtarget.hasSSE2()) {
13429 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13430 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13431 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13432 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13433 }
13434
13435 // Use dedicated unpack instructions for masks that match their pattern.
13436 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13437 return V;
13438
13439 // Otherwise fall back to a SHUFPS lowering strategy.
13440 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13441}
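// Illustrative aside: a standalone sketch of the even/odd duplicate patterns
// recognized above, where {0, 0, 2, 2} lowers to MOVSLDUP and {1, 1, 3, 3} to
// MOVSHDUP, with undef lanes free to match anything. The helper names are
// hypothetical.
static bool isEvenDupMask(const int Mask[4]) {
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != (i & ~1))
      return false;
  return true;
}

static bool isOddDupMask(const int Mask[4]) {
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != (i & ~1) + 1)
      return false;
  return true;
}

// Example: {0, -1, 2, 2} still qualifies for MOVSLDUP because the undef lane
// may take any value.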
13442
13443/// Lower 4-lane i32 vector shuffles.
13444///
13445/// We try to handle these with integer-domain shuffles where we can, but for
13446/// blends we use the floating point domain blend instructions.
13447static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13448 const APInt &Zeroable, SDValue V1, SDValue V2,
13449 const X86Subtarget &Subtarget,
13450 SelectionDAG &DAG) {
13451 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13452 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13453 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13454
13455 // Whenever we can lower this as a zext, that instruction is strictly faster
13456 // than any alternative. It also allows us to fold memory operands into the
13457 // shuffle in many cases.
13458 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13459 Zeroable, Subtarget, DAG))
13460 return ZExt;
13461
13462 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13463
13464 // Try to use shift instructions if fast.
13465 if (Subtarget.preferLowerShuffleAsShift()) {
13466 if (SDValue Shift =
13467 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13468 Subtarget, DAG, /*BitwiseOnly*/ true))
13469 return Shift;
13470 if (NumV2Elements == 0)
13471 if (SDValue Rotate =
13472 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13473 return Rotate;
13474 }
13475
13476 if (NumV2Elements == 0) {
13477 // Try to use broadcast unless the mask only has one non-undef element.
13478 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13479 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13480 Mask, Subtarget, DAG))
13481 return Broadcast;
13482 }
13483
13484 // Straight shuffle of a single input vector. For everything from SSE2
13485 // onward this has a single fast instruction with no scary immediates.
13486 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13487 // but we aren't actually going to use the UNPCK instruction because doing
13488 // so prevents folding a load into this instruction or making a copy.
13489 const int UnpackLoMask[] = {0, 0, 1, 1};
13490 const int UnpackHiMask[] = {2, 2, 3, 3};
13491 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13492 Mask = UnpackLoMask;
13493 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13494 Mask = UnpackHiMask;
13495
13496 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13497 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13498 }
13499
13500 if (Subtarget.hasAVX2())
13501 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13502 return Extract;
13503
13504 // Try to use shift instructions.
13505 if (SDValue Shift =
13506 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13507 DAG, /*BitwiseOnly*/ false))
13508 return Shift;
13509
13510 // There are special ways we can lower some single-element blends.
13511 if (NumV2Elements == 1)
13512 if (SDValue V = lowerShuffleAsElementInsertion(
13513 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13514 return V;
13515
13516 // We have different paths for blend lowering, but they all must use the
13517 // *exact* same predicate.
13518 bool IsBlendSupported = Subtarget.hasSSE41();
13519 if (IsBlendSupported)
13520 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13521 Zeroable, Subtarget, DAG))
13522 return Blend;
13523
13524 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13525 Zeroable, Subtarget, DAG))
13526 return Masked;
13527
13528 // Use dedicated unpack instructions for masks that match their pattern.
13529 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13530 return V;
13531
13532 // Try to use byte rotation instructions.
13533 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13534 if (Subtarget.hasSSSE3()) {
13535 if (Subtarget.hasVLX())
13536 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13537 Zeroable, Subtarget, DAG))
13538 return Rotate;
13539
13540 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13541 Subtarget, DAG))
13542 return Rotate;
13543 }
13544
13545 // Assume that a single SHUFPS is faster than an alternative sequence of
13546 // multiple instructions (even if the CPU has a domain penalty).
13547 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13548 if (!isSingleSHUFPSMask(Mask)) {
13549 // If we have direct support for blends, we should lower by decomposing into
13550 // a permute. That will be faster than the domain cross.
13551 if (IsBlendSupported)
13552 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13553 Zeroable, Subtarget, DAG);
13554
13555 // Try to lower by permuting the inputs into an unpack instruction.
13556 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13557 Mask, Subtarget, DAG))
13558 return Unpack;
13559 }
13560
13561 // We implement this with SHUFPS because it can blend from two vectors.
13562 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13563 // up the inputs, bypassing domain shift penalties that we would incur if we
13564 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13565 // relevant.
13566 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13567 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13568 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13569 return DAG.getBitcast(MVT::v4i32, ShufPS);
13570}
13571
13572/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13573/// shuffle lowering, and the most complex part.
13574///
13575/// The lowering strategy is to try to form pairs of input lanes which are
13576/// targeted at the same half of the final vector, and then use a dword shuffle
13577/// to place them onto the right half, and finally unpack the paired lanes into
13578/// their final position.
13579///
13580/// The exact breakdown of how to form these dword pairs and align them on the
13581/// correct sides is really tricky. See the comments within the function for
13582/// more of the details.
13583///
13584/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13585/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13586/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13587/// vector, form the analogous 128-bit 8-element Mask.
13588static SDValue lowerV8I16GeneralSingleInputShuffle(
13589 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13590 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13591 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13592 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13593
13594 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13595 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13596 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13597
13598 // Attempt to directly match PSHUFLW or PSHUFHW.
13599 if (isUndefOrInRange(LoMask, 0, 4) &&
13600 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13601 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13602 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13603 }
13604 if (isUndefOrInRange(HiMask, 4, 8) &&
13605 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13606 for (int i = 0; i != 4; ++i)
13607 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13608 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13609 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13610 }
13611
13612 SmallVector<int, 4> LoInputs;
13613 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13614 array_pod_sort(LoInputs.begin(), LoInputs.end());
13615 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13616 SmallVector<int, 4> HiInputs;
13617 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13618 array_pod_sort(HiInputs.begin(), HiInputs.end());
13619 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13620 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13621 int NumHToL = LoInputs.size() - NumLToL;
13622 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13623 int NumHToH = HiInputs.size() - NumLToH;
13624 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13625 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13626 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13627 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13628
13629 // If we are shuffling values from one half, check how many different DWORD
13630 // pairs we need to create. If only 1 or 2 then we can perform this as a
13631 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13632 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13633 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13634 V = DAG.getNode(ShufWOp, DL, VT, V,
13635 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13636 V = DAG.getBitcast(PSHUFDVT, V);
13637 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13638 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13639 return DAG.getBitcast(VT, V);
13640 };
13641
13642 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13643 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13644 SmallVector<std::pair<int, int>, 4> DWordPairs;
13645 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13646
13647 // Collect the different DWORD pairs.
13648 for (int DWord = 0; DWord != 4; ++DWord) {
13649 int M0 = Mask[2 * DWord + 0];
13650 int M1 = Mask[2 * DWord + 1];
13651 M0 = (M0 >= 0 ? M0 % 4 : M0);
13652 M1 = (M1 >= 0 ? M1 % 4 : M1);
13653 if (M0 < 0 && M1 < 0)
13654 continue;
13655
13656 bool Match = false;
13657 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13658 auto &DWordPair = DWordPairs[j];
13659 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13660 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13661 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13662 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13663 PSHUFDMask[DWord] = DOffset + j;
13664 Match = true;
13665 break;
13666 }
13667 }
13668 if (!Match) {
13669 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13670 DWordPairs.push_back(std::make_pair(M0, M1));
13671 }
13672 }
13673
13674 if (DWordPairs.size() <= 2) {
13675 DWordPairs.resize(2, std::make_pair(-1, -1));
13676 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13677 DWordPairs[1].first, DWordPairs[1].second};
13678 if ((NumHToL + NumHToH) == 0)
13679 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13680 if ((NumLToL + NumLToH) == 0)
13681 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13682 }
13683 }
13684
13685 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13686 // such inputs we can swap two of the dwords across the half mark and end up
13687 // with <=2 inputs to each half in each half. Once there, we can fall through
13688 // to the generic code below. For example:
13689 //
13690 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13691 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13692 //
13693 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13694 // and an existing 2-into-2 on the other half. In this case we may have to
13695 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13696 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13697 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13698 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13699 // half than the one we target for fixing) will be fixed when we re-enter this
13700 // path. We will also combine away any sequence of PSHUFD instructions that
13701 // result into a single instruction. Here is an example of the tricky case:
13702 //
13703 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13704 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13705 //
13706 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13707 //
13708 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13709 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13710 //
13711 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13712 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13713 //
13714 // The result is fine to be handled by the generic logic.
13715 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13716 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13717 int AOffset, int BOffset) {
13718 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13719 "Must call this with A having 3 or 1 inputs from the A half.");
13720 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13721 "Must call this with B having 1 or 3 inputs from the B half.");
13722 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13723 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13724
13725 bool ThreeAInputs = AToAInputs.size() == 3;
13726
13727 // Compute the index of dword with only one word among the three inputs in
13728 // a half by taking the sum of the half with three inputs and subtracting
13729 // the sum of the actual three inputs. The difference is the remaining
13730 // slot.
13731 int ADWord = 0, BDWord = 0;
13732 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13733 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13734 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13735 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13736 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13737 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13738 int TripleNonInputIdx =
13739 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13740 TripleDWord = TripleNonInputIdx / 2;
13741
13742 // We use xor with one to compute the adjacent DWord to whichever one the
13743 // OneInput is in.
13744 OneInputDWord = (OneInput / 2) ^ 1;
13745
13746 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13747 // and BToA inputs. If there is also such a problem with the BToB and AToB
13748 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13749 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13750 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13751 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13752 // Compute how many inputs will be flipped by swapping these DWords.
13753 // We need to balance this to ensure we don't form a 3-1 shuffle in
13754 // the other half, as that could cause us to oscillate between the
13755 // two halves without making progress.
13756 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13757 llvm::count(AToBInputs, 2 * ADWord + 1);
13758 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13759 llvm::count(BToBInputs, 2 * BDWord + 1);
13760 if ((NumFlippedAToBInputs == 1 &&
13761 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13762 (NumFlippedBToBInputs == 1 &&
13763 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13764 // We choose whether to fix the A half or B half based on whether that
13765 // half has zero flipped inputs. At zero, we may not be able to fix it
13766 // with that half. We also bias towards fixing the B half because that
13767 // will more commonly be the high half, and we have to bias one way.
13768 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13769 ArrayRef<int> Inputs) {
13770 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13771 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13772 // Determine whether the free index is in the flipped dword or the
13773 // unflipped dword based on where the pinned index is. We use this bit
13774 // in an xor to conditionally select the adjacent dword.
13775 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13776 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13777 if (IsFixIdxInput == IsFixFreeIdxInput)
13778 FixFreeIdx += 1;
13779 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13780 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13781 "We need to be changing the number of flipped inputs!");
13782 int PSHUFHalfMask[] = {0, 1, 2, 3};
13783 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13784 V = DAG.getNode(
13785 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13786 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13787 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13788
13789 for (int &M : Mask)
13790 if (M >= 0 && M == FixIdx)
13791 M = FixFreeIdx;
13792 else if (M >= 0 && M == FixFreeIdx)
13793 M = FixIdx;
13794 };
13795 if (NumFlippedBToBInputs != 0) {
13796 int BPinnedIdx =
13797 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13798 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13799 } else {
13800 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13801 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13802 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13803 }
13804 }
13805 }
13806
13807 int PSHUFDMask[] = {0, 1, 2, 3};
13808 PSHUFDMask[ADWord] = BDWord;
13809 PSHUFDMask[BDWord] = ADWord;
13810 V = DAG.getBitcast(
13811 VT,
13812 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13813 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13814
13815 // Adjust the mask to match the new locations of A and B.
13816 for (int &M : Mask)
13817 if (M >= 0 && M/2 == ADWord)
13818 M = 2 * BDWord + M % 2;
13819 else if (M >= 0 && M/2 == BDWord)
13820 M = 2 * ADWord + M % 2;
13821
13822 // Recurse back into this routine to re-compute state now that this isn't
13823 // a 3 and 1 problem.
13824 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13825 };
13826 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13827 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13828 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13829 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13830
13831 // At this point there are at most two inputs to the low and high halves from
13832 // each half. That means the inputs can always be grouped into dwords and
13833 // those dwords can then be moved to the correct half with a dword shuffle.
13834 // We use at most one low and one high word shuffle to collect these paired
13835 // inputs into dwords, and finally a dword shuffle to place them.
13836 int PSHUFLMask[4] = {-1, -1, -1, -1};
13837 int PSHUFHMask[4] = {-1, -1, -1, -1};
13838 int PSHUFDMask[4] = {-1, -1, -1, -1};
13839
13840 // First fix the masks for all the inputs that are staying in their
13841 // original halves. This will then dictate the targets of the cross-half
13842 // shuffles.
13843 auto fixInPlaceInputs =
13844 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13845 MutableArrayRef<int> SourceHalfMask,
13846 MutableArrayRef<int> HalfMask, int HalfOffset) {
13847 if (InPlaceInputs.empty())
13848 return;
13849 if (InPlaceInputs.size() == 1) {
13850 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13851 InPlaceInputs[0] - HalfOffset;
13852 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13853 return;
13854 }
13855 if (IncomingInputs.empty()) {
13856 // Just fix all of the in place inputs.
13857 for (int Input : InPlaceInputs) {
13858 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13859 PSHUFDMask[Input / 2] = Input / 2;
13860 }
13861 return;
13862 }
13863
13864 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13865 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13866 InPlaceInputs[0] - HalfOffset;
13867 // Put the second input next to the first so that they are packed into
13868 // a dword. We find the adjacent index by toggling the low bit.
13869 int AdjIndex = InPlaceInputs[0] ^ 1;
13870 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13871 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13872 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13873 };
13874 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13875 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13876
13877 // Now gather the cross-half inputs and place them into a free dword of
13878 // their target half.
13879 // FIXME: This operation could almost certainly be simplified dramatically to
13880 // look more like the 3-1 fixing operation.
13881 auto moveInputsToRightHalf = [&PSHUFDMask](
13882 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13883 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13884 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13885 int DestOffset) {
13886 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13887 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13888 };
13889 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13890 int Word) {
13891 int LowWord = Word & ~1;
13892 int HighWord = Word | 1;
13893 return isWordClobbered(SourceHalfMask, LowWord) ||
13894 isWordClobbered(SourceHalfMask, HighWord);
13895 };
13896
13897 if (IncomingInputs.empty())
13898 return;
13899
13900 if (ExistingInputs.empty()) {
13901 // Map any dwords containing incoming inputs into the right half.
13902 for (int Input : IncomingInputs) {
13903 // If the source half mask maps over the inputs, turn those into
13904 // swaps and use the swapped lane.
13905 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13906 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13907 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13908 Input - SourceOffset;
13909 // We have to swap the uses in our half mask in one sweep.
13910 for (int &M : HalfMask)
13911 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13912 M = Input;
13913 else if (M == Input)
13914 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13915 } else {
13916 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13917 Input - SourceOffset &&
13918 "Previous placement doesn't match!");
13919 }
13920 // Note that this correctly re-maps both when we do a swap and when
13921 // we observe the other side of the swap above. We rely on that to
13922 // avoid swapping the members of the input list directly.
13923 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13924 }
13925
13926 // Map the input's dword into the correct half.
13927 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13928 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13929 else
13930 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13931 Input / 2 &&
13932 "Previous placement doesn't match!");
13933 }
13934
13935 // And just directly shift any other-half mask elements to be same-half
13936 // as we will have mirrored the dword containing the element into the
13937 // same position within that half.
13938 for (int &M : HalfMask)
13939 if (M >= SourceOffset && M < SourceOffset + 4) {
13940 M = M - SourceOffset + DestOffset;
13941 assert(M >= 0 && "This should never wrap below zero!");
13942 }
13943 return;
13944 }
13945
13946 // Ensure we have the input in a viable dword of its current half. This
13947 // is particularly tricky because the original position may be clobbered
13948 // by inputs being moved and *staying* in that half.
13949 if (IncomingInputs.size() == 1) {
13950 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13951 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13952 SourceOffset;
13953 SourceHalfMask[InputFixed - SourceOffset] =
13954 IncomingInputs[0] - SourceOffset;
13955 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13956 InputFixed);
13957 IncomingInputs[0] = InputFixed;
13958 }
13959 } else if (IncomingInputs.size() == 2) {
13960 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13961 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13962 // We have two non-adjacent or clobbered inputs we need to extract from
13963 // the source half. To do this, we need to map them into some adjacent
13964 // dword slot in the source mask.
13965 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13966 IncomingInputs[1] - SourceOffset};
13967
13968 // If there is a free slot in the source half mask adjacent to one of
13969 // the inputs, place the other input in it. We use (Index XOR 1) to
13970 // compute an adjacent index.
13971 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13972 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13973 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13974 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13975 InputsFixed[1] = InputsFixed[0] ^ 1;
13976 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13977 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13978 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13979 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13980 InputsFixed[0] = InputsFixed[1] ^ 1;
13981 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13982 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13983 // The two inputs are in the same DWord but it is clobbered and the
13984 // adjacent DWord isn't used at all. Move both inputs to the free
13985 // slot.
13986 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13987 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13988 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13989 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13990 } else {
13991 // The only way we hit this point is if there is no clobbering
13992 // (because there are no off-half inputs to this half) and there is no
13993 // free slot adjacent to one of the inputs. In this case, we have to
13994 // swap an input with a non-input.
13995 for (int i = 0; i < 4; ++i)
13996 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13997 "We can't handle any clobbers here!");
13998 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13999 "Cannot have adjacent inputs here!");
14000
14001 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14002 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14003
14004 // We also have to update the final source mask in this case because
14005 // it may need to undo the above swap.
14006 for (int &M : FinalSourceHalfMask)
14007 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14008 M = InputsFixed[1] + SourceOffset;
14009 else if (M == InputsFixed[1] + SourceOffset)
14010 M = (InputsFixed[0] ^ 1) + SourceOffset;
14011
14012 InputsFixed[1] = InputsFixed[0] ^ 1;
14013 }
14014
14015 // Point everything at the fixed inputs.
14016 for (int &M : HalfMask)
14017 if (M == IncomingInputs[0])
14018 M = InputsFixed[0] + SourceOffset;
14019 else if (M == IncomingInputs[1])
14020 M = InputsFixed[1] + SourceOffset;
14021
14022 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14023 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14024 }
14025 } else {
14026 llvm_unreachable("Unhandled input size!");
14027 }
14028
14029 // Now hoist the DWord down to the right half.
14030 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14031 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14032 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14033 for (int &M : HalfMask)
14034 for (int Input : IncomingInputs)
14035 if (M == Input)
14036 M = FreeDWord * 2 + Input % 2;
14037 };
14038 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14039 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14040 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14041 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14042
14043 // Now enact all the shuffles we've computed to move the inputs into their
14044 // target half.
14045 if (!isNoopShuffleMask(PSHUFLMask))
14046 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14047 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14048 if (!isNoopShuffleMask(PSHUFHMask))
14049 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14050 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14051 if (!isNoopShuffleMask(PSHUFDMask))
14052 V = DAG.getBitcast(
14053 VT,
14054 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14055 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14056
14057 // At this point, each half should contain all its inputs, and we can then
14058 // just shuffle them into their final position.
14059 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
14060 "Failed to lift all the high half inputs to the low mask!");
14061 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14062 "Failed to lift all the low half inputs to the high mask!");
14063
14064 // Do a half shuffle for the low mask.
14065 if (!isNoopShuffleMask(LoMask))
14066 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14067 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14068
14069 // Do a half shuffle with the high mask after shifting its values down.
14070 for (int &M : HiMask)
14071 if (M >= 0)
14072 M -= 4;
14073 if (!isNoopShuffleMask(HiMask))
14074 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14075 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14076
14077 return V;
14078}
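// Illustrative aside: a standalone sketch of the direct PSHUFLW/PSHUFHW match
// tried at the top of the routine above. PSHUFLW applies when the high four
// lanes are (up to undef) the identity and the low four lanes only draw from
// the low half; PSHUFHW is the mirror case. The names below are hypothetical.
#include <array>

enum class HalfWordShuffle { None, PSHUFLW, PSHUFHW };

static HalfWordShuffle matchHalfWordShuffle(const std::array<int, 8> &Mask) {
  auto InRange = [&](int Lo, int Hi, int First) {
    for (int i = First; i != First + 4; ++i)
      if (Mask[i] >= 0 && (Mask[i] < Lo || Mask[i] >= Hi))
        return false;
    return true;
  };
  auto IsIdentity = [&](int First) {
    for (int i = First; i != First + 4; ++i)
      if (Mask[i] >= 0 && Mask[i] != i)
        return false;
    return true;
  };
  if (InRange(0, 4, 0) && IsIdentity(4))
    return HalfWordShuffle::PSHUFLW;
  if (InRange(4, 8, 4) && IsIdentity(0))
    return HalfWordShuffle::PSHUFHW;
  return HalfWordShuffle::None;
}

// Example: {2, 1, 3, 0, 4, 5, 6, 7} matches PSHUFLW, while
// {0, 1, 2, 3, 7, 6, 5, 4} matches PSHUFHW.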
14079
14080/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14081/// blend if only one input is used.
14082static SDValue lowerShuffleAsBlendOfPSHUFBs(
14083 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14084 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14085 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14086 "Lane crossing shuffle masks not supported");
14087
14088 int NumBytes = VT.getSizeInBits() / 8;
14089 int Size = Mask.size();
14090 int Scale = NumBytes / Size;
14091
14092 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14093 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14094 V1InUse = false;
14095 V2InUse = false;
14096
14097 for (int i = 0; i < NumBytes; ++i) {
14098 int M = Mask[i / Scale];
14099 if (M < 0)
14100 continue;
14101
14102 const int ZeroMask = 0x80;
14103 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14104 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14105 if (Zeroable[i / Scale])
14106 V1Idx = V2Idx = ZeroMask;
14107
14108 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14109 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14110 V1InUse |= (ZeroMask != V1Idx);
14111 V2InUse |= (ZeroMask != V2Idx);
14112 }
14113
14114 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14115 if (V1InUse)
14116 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14117 DAG.getBuildVector(ShufVT, DL, V1Mask));
14118 if (V2InUse)
14119 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14120 DAG.getBuildVector(ShufVT, DL, V2Mask));
14121
14122 // If we need shuffled inputs from both, blend the two.
14123 SDValue V;
14124 if (V1InUse && V2InUse)
14125 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14126 else
14127 V = V1InUse ? V1 : V2;
14128
14129 // Cast the result back to the correct type.
14130 return DAG.getBitcast(VT, V);
14131}
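// Illustrative aside: a standalone sketch of the byte-level PSHUFB selector
// construction above for a 128-bit shuffle. Each element of the original mask
// expands to Scale consecutive byte indices, 0x80 marks a byte the
// corresponding input should zero, and -1 is kept for fully undef bytes. The
// zeroable-lane handling is omitted and the names below are hypothetical.
#include <vector>

struct PSHUFBByteMasks {
  std::vector<int> V1Bytes, V2Bytes;
};

static PSHUFBByteMasks buildPSHUFBByteMasks(const std::vector<int> &Mask) {
  const int Size = int(Mask.size());
  const int NumBytes = 16;           // 128-bit vector.
  const int Scale = NumBytes / Size; // Bytes per mask element.
  const int ZeroByte = 0x80;
  PSHUFBByteMasks R{std::vector<int>(NumBytes, -1),
                    std::vector<int>(NumBytes, -1)};
  for (int i = 0; i != NumBytes; ++i) {
    int M = Mask[i / Scale];
    if (M < 0)
      continue; // Undef element: leave both byte selectors undef.
    R.V1Bytes[i] = M < Size ? M * Scale + i % Scale : ZeroByte;
    R.V2Bytes[i] = M < Size ? ZeroByte : (M - Size) * Scale + i % Scale;
  }
  return R;
}

// Example: for a v8i16 mask {0, 9, 2, 11, 4, 13, 6, 15} (Scale == 2), bytes
// 2-3 of the V2 selector are {2, 3} while the V1 selector zeroes them, and
// bytes 4-5 of the V1 selector are {4, 5} while the V2 selector zeroes them;
// ORing the two PSHUFB results produces the blended shuffle.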
14132
14133/// Generic lowering of 8-lane i16 shuffles.
14134///
14135/// This handles both single-input shuffles and combined shuffle/blends with
14136/// two inputs. The single input shuffles are immediately delegated to
14137/// a dedicated lowering routine.
14138///
14139/// The blends are lowered in one of three fundamental ways. If there are few
14140/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14141/// of the input is significantly cheaper when lowered as an interleaving of
14142/// the two inputs, try to interleave them. Otherwise, blend the low and high
14143/// halves of the inputs separately (making them have relatively few inputs)
14144/// and then concatenate them.
14145static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14146 const APInt &Zeroable, SDValue V1, SDValue V2,
14147 const X86Subtarget &Subtarget,
14148 SelectionDAG &DAG) {
14149 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14150 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14151 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14152
14153 // Whenever we can lower this as a zext, that instruction is strictly faster
14154 // than any alternative.
14155 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14156 Zeroable, Subtarget, DAG))
14157 return ZExt;
14158
14159 // Try to lower using a truncation.
14160 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14161 Subtarget, DAG))
14162 return V;
14163
14164 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14165
14166 if (NumV2Inputs == 0) {
14167 // Try to use shift instructions.
14168 if (SDValue Shift =
14169 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14170 Subtarget, DAG, /*BitwiseOnly*/ false))
14171 return Shift;
14172
14173 // Check for being able to broadcast a single element.
14174 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14175 Mask, Subtarget, DAG))
14176 return Broadcast;
14177
14178 // Try to use bit rotation instructions.
14179 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14180 Subtarget, DAG))
14181 return Rotate;
14182
14183 // Use dedicated unpack instructions for masks that match their pattern.
14184 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14185 return V;
14186
14187 // Use dedicated pack instructions for masks that match their pattern.
14188 if (SDValue V =
14189 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14190 return V;
14191
14192 // Try to use byte rotation instructions.
14193 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14194 Subtarget, DAG))
14195 return Rotate;
14196
14197 // Make a copy of the mask so it can be modified.
14198 SmallVector<int, 8> MutableMask(Mask);
14199 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14200 Subtarget, DAG);
14201 }
14202
14203 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14204 "All single-input shuffles should be canonicalized to be V1-input "
14205 "shuffles.");
14206
14207 // Try to use shift instructions.
14208 if (SDValue Shift =
14209 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14210 DAG, /*BitwiseOnly*/ false))
14211 return Shift;
14212
14213 // See if we can use SSE4A Extraction / Insertion.
14214 if (Subtarget.hasSSE4A())
14215 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14216 Zeroable, DAG))
14217 return V;
14218
14219 // There are special ways we can lower some single-element blends.
14220 if (NumV2Inputs == 1)
14221 if (SDValue V = lowerShuffleAsElementInsertion(
14222 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14223 return V;
14224
14225 // We have different paths for blend lowering, but they all must use the
14226 // *exact* same predicate.
14227 bool IsBlendSupported = Subtarget.hasSSE41();
14228 if (IsBlendSupported)
14229 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14230 Zeroable, Subtarget, DAG))
14231 return Blend;
14232
14233 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14234 Zeroable, Subtarget, DAG))
14235 return Masked;
14236
14237 // Use dedicated unpack instructions for masks that match their pattern.
14238 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14239 return V;
14240
14241 // Use dedicated pack instructions for masks that match their pattern.
14242 if (SDValue V =
14243 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14244 return V;
14245
14246 // Try to lower using a truncation.
14247 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14248 Subtarget, DAG))
14249 return V;
14250
14251 // Try to use byte rotation instructions.
14252 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14253 Subtarget, DAG))
14254 return Rotate;
14255
14256 if (SDValue BitBlend =
14257 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14258 return BitBlend;
14259
14260 // Try to use byte shift instructions to mask.
14261 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14262 Zeroable, Subtarget, DAG))
14263 return V;
14264
14265 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
14266 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14267 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14268 !Subtarget.hasVLX()) {
14269 // Check if this is part of a 256-bit vector truncation.
14270 unsigned PackOpc = 0;
14271 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14272 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14273 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14274 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14275 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14276 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14277 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14278 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14279 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14280 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14281 PackOpc = X86ISD::PACKUS;
14282 } else if (Subtarget.hasSSE41()) {
14283 SmallVector<SDValue, 4> DWordClearOps(4,
14284 DAG.getConstant(0, DL, MVT::i32));
14285 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14286 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14287 SDValue DWordClearMask =
14288 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14289 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14290 DWordClearMask);
14291 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14292 DWordClearMask);
14293 PackOpc = X86ISD::PACKUS;
14294 } else if (!Subtarget.hasSSSE3()) {
14295 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14296 V1 = DAG.getBitcast(MVT::v4i32, V1);
14297 V2 = DAG.getBitcast(MVT::v4i32, V2);
14298 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14299 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14300 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14301 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14302 PackOpc = X86ISD::PACKSS;
14303 }
14304 if (PackOpc) {
14305 // Now pack things back together.
14306 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14307 if (NumEvenDrops == 2) {
14308 Result = DAG.getBitcast(MVT::v4i32, Result);
14309 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14310 }
14311 return Result;
14312 }
14313 }
14314
14315 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14316 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14317 if (NumOddDrops == 1) {
14318 bool HasSSE41 = Subtarget.hasSSE41();
14319 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14320 DAG.getBitcast(MVT::v4i32, V1),
14321 DAG.getTargetConstant(16, DL, MVT::i8));
14322 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14323 DAG.getBitcast(MVT::v4i32, V2),
14324 DAG.getTargetConstant(16, DL, MVT::i8));
14325 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14326 MVT::v8i16, V1, V2);
14327 }
14328
14329 // Try to lower by permuting the inputs into an unpack instruction.
14330 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14331 Mask, Subtarget, DAG))
14332 return Unpack;
14333
14334 // If we can't directly blend but can use PSHUFB, that will be better as it
14335 // can both shuffle and set up the inefficient blend.
14336 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14337 bool V1InUse, V2InUse;
14338 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14339 Zeroable, DAG, V1InUse, V2InUse);
14340 }
14341
14342 // We can always bit-blend if we have to, so the fallback strategy is to
14343 // decompose into single-input permutes and blends/unpacks.
14344 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14345 Zeroable, Subtarget, DAG);
14346}
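// Illustrative aside: a standalone scalar model of the even-element
// compaction used above. Keeping every other i16 of the concatenation
// (V1, V2), i.e. the mask {0, 2, 4, 6, 8, 10, 12, 14}, is equivalent to
// clearing the upper half of each i32 lane and packing dwords to words with
// PACKUSDW. The helper name is hypothetical.
#include <array>
#include <cstdint>

static std::array<uint16_t, 8> compactEvenWords(std::array<uint16_t, 8> V1,
                                                std::array<uint16_t, 8> V2) {
  std::array<uint16_t, 8> R{};
  for (int i = 0; i != 4; ++i) {
    R[i] = V1[2 * i];     // Low result half: dwords of (V1 & 0x0000FFFF).
    R[i + 4] = V2[2 * i]; // High result half: dwords of (V2 & 0x0000FFFF).
  }
  return R;
}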
14347
14348/// Lower 8-lane 16-bit floating point shuffles.
14349static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14350 const APInt &Zeroable, SDValue V1, SDValue V2,
14351 const X86Subtarget &Subtarget,
14352 SelectionDAG &DAG) {
14353 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14354 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14355 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14356 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14357
14358 if (Subtarget.hasFP16()) {
14359 if (NumV2Elements == 0) {
14360 // Check for being able to broadcast a single element.
14361 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14362 Mask, Subtarget, DAG))
14363 return Broadcast;
14364 }
14365 if (NumV2Elements == 1 && Mask[0] >= 8)
14366 if (SDValue V = lowerShuffleAsElementInsertion(
14367 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14368 return V;
14369 }
14370
14371 V1 = DAG.getBitcast(MVT::v8i16, V1);
14372 V2 = DAG.getBitcast(MVT::v8i16, V2);
14373 return DAG.getBitcast(MVT::v8f16,
14374 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14375}
14376
14377// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14378// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14379// the active subvector is extracted.
14380static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14381 ArrayRef<int> OriginalMask, SDValue V1,
14382 SDValue V2, const X86Subtarget &Subtarget,
14383 SelectionDAG &DAG) {
14384 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14385 SmallVector<int, 32> Mask(OriginalMask);
14386 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14387 !isShuffleFoldableLoad(V2)) {
14388 ShuffleVectorSDNode::commuteMask(Mask);
14389 std::swap(V1, V2);
14390 }
14391
14392 MVT MaskVT = VT.changeTypeToInteger();
14393 SDValue MaskNode;
14394 MVT ShuffleVT = VT;
14395 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14396 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14397 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14398 ShuffleVT = V1.getSimpleValueType();
14399
14400 // Adjust mask to correct indices for the second input.
14401 int NumElts = VT.getVectorNumElements();
14402 unsigned Scale = 512 / VT.getSizeInBits();
14403 SmallVector<int, 32> AdjustedMask(Mask);
14404 for (int &M : AdjustedMask)
14405 if (NumElts <= M)
14406 M += (Scale - 1) * NumElts;
14407 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14408 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14409 } else {
14410 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14411 }
14412
14413 SDValue Result;
14414 if (V2.isUndef())
14415 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14416 else
14417 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14418
14419 if (VT != ShuffleVT)
14420 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14421
14422 return Result;
14423}
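// Illustrative aside: a standalone sketch of the mask rebasing above when a
// narrow two-input shuffle is widened to 512 bits for VPERMV3. Indices that
// refer to the second input must be shifted past the padding lanes of the
// widened first input. The helper name is hypothetical.
#include <vector>

static std::vector<int> rebaseMaskForWidening(std::vector<int> Mask,
                                              int NumElts, int Scale) {
  for (int &M : Mask)
    if (M >= NumElts)
      M += (Scale - 1) * NumElts; // Skip the first input's padding lanes.
  return Mask;
}

// Example: a v4i64 shuffle (NumElts == 4) widened to v8i64 (Scale == 2) maps
// the mask {0, 5, 2, 7} to {0, 9, 2, 11}.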
14424
14425/// Generic lowering of v16i8 shuffles.
14426///
14427/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14428/// detect any complexity-reducing interleaving. If that doesn't help, it uses
14429/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14430/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14431/// back together.
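/// For example (illustrative): a single-input byte permute with no special
/// structure is unpacked with zero into two v8i16 half vectors, each half is
/// shuffled as a v8i16 vector, and the halves are packed back together with
/// PACKUS (see the fallback path at the end of this function).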
14432static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14433 const APInt &Zeroable, SDValue V1, SDValue V2,
14434 const X86Subtarget &Subtarget,
14435 SelectionDAG &DAG) {
14436 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14437 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14438 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14439
14440 // Try to use shift instructions.
14441 if (SDValue Shift =
14442 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14443 DAG, /*BitwiseOnly*/ false))
14444 return Shift;
14445
14446 // Try to use byte rotation instructions.
14447 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14448 Subtarget, DAG))
14449 return Rotate;
14450
14451 // Use dedicated pack instructions for masks that match their pattern.
14452 if (SDValue V =
14453 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14454 return V;
14455
14456 // Try to use a zext lowering.
14457 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14458 Zeroable, Subtarget, DAG))
14459 return ZExt;
14460
14461 // Try to lower using a truncation.
14462 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14463 Subtarget, DAG))
14464 return V;
14465
14466 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14467 Subtarget, DAG))
14468 return V;
14469
14470 // See if we can use SSE4A Extraction / Insertion.
14471 if (Subtarget.hasSSE4A())
14472 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14473 Zeroable, DAG))
14474 return V;
14475
14476 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14477
14478 // For single-input shuffles, there are some nicer lowering tricks we can use.
14479 if (NumV2Elements == 0) {
14480 // Check for being able to broadcast a single element.
14481 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14482 Mask, Subtarget, DAG))
14483 return Broadcast;
14484
14485 // Try to use bit rotation instructions.
14486 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14487 Subtarget, DAG))
14488 return Rotate;
14489
14490 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14491 return V;
14492
14493 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14494 // Notably, this handles splat and partial-splat shuffles more efficiently.
14495 // However, it only makes sense if the pre-duplication shuffle simplifies
14496 // things significantly. Currently, this means we need to be able to
14497 // express the pre-duplication shuffle as an i16 shuffle.
14498 //
14499 // FIXME: We should check for other patterns which can be widened into an
14500 // i16 shuffle as well.
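 // For example (illustrative): the single-input mask
 // <6,6,4,4,2,2,0,0,u,u,u,u,u,u,u,u> duplicates each selected byte into an
 // adjacent pair, so it can be lowered as an UNPCKL of V1 with itself
 // followed by the v8i16 shuffle <6,4,2,0,u,u,u,u>.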
14501 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14502 for (int i = 0; i < 16; i += 2)
14503 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14504 return false;
14505
14506 return true;
14507 };
14508 auto tryToWidenViaDuplication = [&]() -> SDValue {
14509 if (!canWidenViaDuplication(Mask))
14510 return SDValue();
14511 SmallVector<int, 4> LoInputs;
14512 copy_if(Mask, std::back_inserter(LoInputs),
14513 [](int M) { return M >= 0 && M < 8; });
14514 array_pod_sort(LoInputs.begin(), LoInputs.end());
14515 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14516 SmallVector<int, 4> HiInputs;
14517 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14518 array_pod_sort(HiInputs.begin(), HiInputs.end());
14519 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14520
14521 bool TargetLo = LoInputs.size() >= HiInputs.size();
14522 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14523 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14524
14525 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14526 SmallDenseMap<int, int, 8> LaneMap;
14527 for (int I : InPlaceInputs) {
14528 PreDupI16Shuffle[I/2] = I/2;
14529 LaneMap[I] = I;
14530 }
14531 int j = TargetLo ? 0 : 4, je = j + 4;
14532 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14533 // Check if j is already a shuffle of this input. This happens when
14534 // there are two adjacent bytes after we move the low one.
14535 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14536 // If we haven't yet mapped the input, search for a slot into which
14537 // we can map it.
14538 while (j < je && PreDupI16Shuffle[j] >= 0)
14539 ++j;
14540
14541 if (j == je)
14542 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14543 return SDValue();
14544
14545 // Map this input with the i16 shuffle.
14546 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14547 }
14548
14549 // Update the lane map based on the mapping we ended up with.
14550 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14551 }
14552 V1 = DAG.getBitcast(
14553 MVT::v16i8,
14554 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14555 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14556
14557 // Unpack the bytes to form the i16s that will be shuffled into place.
14558 bool EvenInUse = false, OddInUse = false;
14559 for (int i = 0; i < 16; i += 2) {
14560 EvenInUse |= (Mask[i + 0] >= 0);
14561 OddInUse |= (Mask[i + 1] >= 0);
14562 if (EvenInUse && OddInUse)
14563 break;
14564 }
14565 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14566 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14567 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14568
14569 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14570 for (int i = 0; i < 16; ++i)
14571 if (Mask[i] >= 0) {
14572 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14573 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14574 if (PostDupI16Shuffle[i / 2] < 0)
14575 PostDupI16Shuffle[i / 2] = MappedMask;
14576 else
14577 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14578 "Conflicting entries in the original shuffle!");
14579 }
14580 return DAG.getBitcast(
14581 MVT::v16i8,
14582 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14583 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14584 };
14585 if (SDValue V = tryToWidenViaDuplication())
14586 return V;
14587 }
14588
14589 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14590 Zeroable, Subtarget, DAG))
14591 return Masked;
14592
14593 // Use dedicated unpack instructions for masks that match their pattern.
14594 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14595 return V;
14596
14597 // Try to use byte shift instructions to mask.
14598 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14599 Zeroable, Subtarget, DAG))
14600 return V;
14601
14602 // Check for compaction patterns.
14603 bool IsSingleInput = V2.isUndef();
14604 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14605
14606 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14607 // with PSHUFB. It is important to do this before we attempt to generate any
14608 // blends but after all of the single-input lowerings. If the single input
14609 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14610 // want to preserve that and we can DAG combine any longer sequences into
14611 // a PSHUFB in the end. But once we start blending from multiple inputs,
14612 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14613 // and there are *very* few patterns that would actually be faster than the
14614 // PSHUFB approach because of its ability to zero lanes.
14615 //
14616 // If the mask is a binary compaction, we can more efficiently perform this
14617 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14618 //
14619 // FIXME: The only exceptions to the above are blends which are exact
14620 // interleavings with direct instructions supporting them. We currently don't
14621 // handle those well here.
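 // For example (illustrative): a general two-input byte shuffle ends up as
 // two PSHUFBs whose results are merged (OR'd, blended or unpacked), unless
 // one of the cheaper forms above or below applies.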
14622 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14623 bool V1InUse = false;
14624 bool V2InUse = false;
14625
14626 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14627 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14628
14629 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14630 // do so. This avoids using them to handle blends-with-zero which is
14631 // important as a single pshufb is significantly faster for that.
14632 if (V1InUse && V2InUse) {
14633 if (Subtarget.hasSSE41())
14634 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14635 Zeroable, Subtarget, DAG))
14636 return Blend;
14637
14638 // We can use an unpack to do the blending rather than an or in some
14639 // cases. Even though the or may be (very minorly) more efficient, we
14640 // prefer this lowering because there are common cases where part of
14641 // the complexity of the shuffles goes away when we do the final blend as
14642 // an unpack.
14643 // FIXME: It might be worth trying to detect if the unpack-feeding
14644 // shuffles will both be pshufb, in which case we shouldn't bother with
14645 // this.
14646 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14647 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14648 return Unpack;
14649
14650 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14651 if (Subtarget.hasVBMI())
14652 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14653 DAG);
14654
14655 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14656 if (Subtarget.hasXOP()) {
14657 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14658 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14659 }
14660
14661 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14662 // PALIGNR will be cheaper than the second PSHUFB+OR.
14663 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14664 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14665 return V;
14666 }
14667
14668 return PSHUFB;
14669 }
14670
14671 // There are special ways we can lower some single-element blends.
14672 if (NumV2Elements == 1)
14673 if (SDValue V = lowerShuffleAsElementInsertion(
14674 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14675 return V;
14676
14677 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14678 return Blend;
14679
14680 // Check whether a compaction lowering can be done. This handles shuffles
14681 // which take every Nth element for some even N. See the helper function for
14682 // details.
14683 //
14684 // We special case these as they can be particularly efficiently handled with
14685 // the PACKUSWB instruction on x86 and they show up in common patterns of
14686 // rearranging bytes to truncate wide elements.
14687 if (NumEvenDrops) {
14688 // NumEvenDrops is the power of two stride of the elements. Another way of
14689 // thinking about it is that we need to drop the even elements this many
14690 // times to get the original input.
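 // For example (illustrative): the two-input mask <0,2,4,...,30> has
 // NumEvenDrops == 1 and becomes PACKUS(AND(V1, 0x00FF), AND(V2, 0x00FF)),
 // i.e. the low byte of every 16-bit word of each input.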
14691
14692 // First we need to zero all the dropped bytes.
14693 assert(NumEvenDrops <= 3 &&
14694 "No support for dropping even elements more than 3 times.");
14695 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14696 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14697 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14698 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14699 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14700 WordClearMask);
14701 if (!IsSingleInput)
14702 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14703 WordClearMask);
14704
14705 // Now pack things back together.
14706 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14707 IsSingleInput ? V1 : V2);
14708 for (int i = 1; i < NumEvenDrops; ++i) {
14709 Result = DAG.getBitcast(MVT::v8i16, Result);
14710 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14711 }
14712 return Result;
14713 }
14714
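 // Similarly (illustrative), a mask such as <1,3,5,...,31> takes the odd
 // bytes: shifting each 16-bit word right by 8 and then PACKUS'ing performs
 // the same compaction from the high byte of every word.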
14715 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14716 if (NumOddDrops == 1) {
14717 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14718 DAG.getBitcast(MVT::v8i16, V1),
14719 DAG.getTargetConstant(8, DL, MVT::i8));
14720 if (!IsSingleInput)
14721 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14722 DAG.getBitcast(MVT::v8i16, V2),
14723 DAG.getTargetConstant(8, DL, MVT::i8));
14724 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14725 IsSingleInput ? V1 : V2);
14726 }
14727
14728 // Handle multi-input cases by blending/unpacking single-input shuffles.
14729 if (NumV2Elements > 0)
14730 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14731 Zeroable, Subtarget, DAG);
14732
14733 // The fallback path for single-input shuffles widens this into two v8i16
14734 // vectors with unpacks, shuffles those, and then pulls them back together
14735 // with a pack.
14736 SDValue V = V1;
14737
14738 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14739 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14740 for (int i = 0; i < 16; ++i)
14741 if (Mask[i] >= 0)
14742 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14743
14744 SDValue VLoHalf, VHiHalf;
14745 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14746 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14747 // i16s.
14748 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14749 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14750 // Use a mask to drop the high bytes.
14751 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14752 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14753 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14754
14755 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14756 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14757
14758 // Squash the masks to point directly into VLoHalf.
14759 for (int &M : LoBlendMask)
14760 if (M >= 0)
14761 M /= 2;
14762 for (int &M : HiBlendMask)
14763 if (M >= 0)
14764 M /= 2;
14765 } else {
14766 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14767 // VHiHalf so that we can blend them as i16s.
14768 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14769
14770 VLoHalf = DAG.getBitcast(
14771 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14772 VHiHalf = DAG.getBitcast(
14773 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14774 }
14775
14776 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14777 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14778
14779 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14780}
14781
14782/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14783///
14784/// This routine breaks down the specific type of 128-bit shuffle and
14785/// dispatches to the lowering routines accordingly.
14786static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14787 MVT VT, SDValue V1, SDValue V2,
14788 const APInt &Zeroable,
14789 const X86Subtarget &Subtarget,
14790 SelectionDAG &DAG) {
14791 if (VT == MVT::v8bf16) {
14792 V1 = DAG.getBitcast(MVT::v8i16, V1);
14793 V2 = DAG.getBitcast(MVT::v8i16, V2);
14794 return DAG.getBitcast(VT,
14795 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14796 }
14797
14798 switch (VT.SimpleTy) {
14799 case MVT::v2i64:
14800 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14801 case MVT::v2f64:
14802 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14803 case MVT::v4i32:
14804 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14805 case MVT::v4f32:
14806 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14807 case MVT::v8i16:
14808 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14809 case MVT::v8f16:
14810 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14811 case MVT::v16i8:
14812 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14813
14814 default:
14815 llvm_unreachable("Unimplemented!");
14816 }
14817}
14818
14819/// Generic routine to split vector shuffle into half-sized shuffles.
14820///
14821/// This routine just extracts two subvectors, shuffles them independently, and
14822/// then concatenates them back together. This should work effectively with all
14823/// AVX vector shuffle types.
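/// For example (illustrative): a v8f32 shuffle is split into two v4f32
/// shuffles of the extracted 128-bit halves, each is lowered independently,
/// and the results are joined again with CONCAT_VECTORS.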
14824static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14825 SDValue V2, ArrayRef<int> Mask,
14826 SelectionDAG &DAG, bool SimpleOnly) {
14827 assert(VT.getSizeInBits() >= 256 &&
14828 "Only for 256-bit or wider vector shuffles!");
14829 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14830 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14831
14832 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14833 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14834
14835 int NumElements = VT.getVectorNumElements();
14836 int SplitNumElements = NumElements / 2;
14837 MVT ScalarVT = VT.getVectorElementType();
14838 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14839
14840 // Use splitVector/extractSubVector so that split build-vectors just build two
14841 // narrower build vectors. This helps shuffling with splats and zeros.
14842 auto SplitVector = [&](SDValue V) {
14843 SDValue LoV, HiV;
14844 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14845 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14846 DAG.getBitcast(SplitVT, HiV));
14847 };
14848
14849 SDValue LoV1, HiV1, LoV2, HiV2;
14850 std::tie(LoV1, HiV1) = SplitVector(V1);
14851 std::tie(LoV2, HiV2) = SplitVector(V2);
14852
14853 // Now create two 4-way blends of these half-width vectors.
14854 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14855 bool &UseHiV1, bool &UseLoV2,
14856 bool &UseHiV2) {
14857 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14858 for (int i = 0; i < SplitNumElements; ++i) {
14859 int M = HalfMask[i];
14860 if (M >= NumElements) {
14861 if (M >= NumElements + SplitNumElements)
14862 UseHiV2 = true;
14863 else
14864 UseLoV2 = true;
14865 } else if (M >= 0) {
14866 if (M >= SplitNumElements)
14867 UseHiV1 = true;
14868 else
14869 UseLoV1 = true;
14870 }
14871 }
14872 };
14873
14874 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14875 if (!SimpleOnly)
14876 return true;
14877
14878 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14879 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14880
14881 return !(UseHiV1 || UseHiV2);
14882 };
14883
14884 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14885 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14886 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14887 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14888 for (int i = 0; i < SplitNumElements; ++i) {
14889 int M = HalfMask[i];
14890 if (M >= NumElements) {
14891 V2BlendMask[i] = M - NumElements;
14892 BlendMask[i] = SplitNumElements + i;
14893 } else if (M >= 0) {
14894 V1BlendMask[i] = M;
14895 BlendMask[i] = i;
14896 }
14897 }
14898
14899 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14900 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14901
14902 // Because the lowering happens after all combining takes place, we need to
14903 // manually combine these blend masks as much as possible so that we create
14904 // a minimal number of high-level vector shuffle nodes.
14905 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14906
14907 // First try just blending the halves of V1 or V2.
14908 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14909 return DAG.getUNDEF(SplitVT);
14910 if (!UseLoV2 && !UseHiV2)
14911 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14912 if (!UseLoV1 && !UseHiV1)
14913 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14914
14915 SDValue V1Blend, V2Blend;
14916 if (UseLoV1 && UseHiV1) {
14917 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14918 } else {
14919 // We only use half of V1 so map the usage down into the final blend mask.
14920 V1Blend = UseLoV1 ? LoV1 : HiV1;
14921 for (int i = 0; i < SplitNumElements; ++i)
14922 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14923 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14924 }
14925 if (UseLoV2 && UseHiV2) {
14926 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14927 } else {
14928 // We only use half of V2 so map the usage down into the final blend mask.
14929 V2Blend = UseLoV2 ? LoV2 : HiV2;
14930 for (int i = 0; i < SplitNumElements; ++i)
14931 if (BlendMask[i] >= SplitNumElements)
14932 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14933 }
14934 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14935 };
14936
14937 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14938 return SDValue();
14939
14940 SDValue Lo = HalfBlend(LoMask);
14941 SDValue Hi = HalfBlend(HiMask);
14942 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14943}
14944
14945/// Either split a vector in halves or decompose the shuffles and the
14946/// blend/unpack.
14947///
14948/// This is provided as a good fallback for many lowerings of non-single-input
14949/// shuffles with more than one 128-bit lane. In those cases, we want to select
14950/// between splitting the shuffle into 128-bit components and stitching those
14951/// back together vs. extracting the single-input shuffles and blending those
14952/// results.
14953static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14954 SDValue V2, ArrayRef<int> Mask,
14955 const APInt &Zeroable,
14956 const X86Subtarget &Subtarget,
14957 SelectionDAG &DAG) {
14958 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14959 "shuffles as it could then recurse on itself.");
14960 int Size = Mask.size();
14961
14962 // If this can be modeled as a broadcast of two elements followed by a blend,
14963 // prefer that lowering. This is especially important because broadcasts can
14964 // often fold with memory operands.
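 // For example (illustrative): the v8f32 mask <0,8,0,8,0,8,0,8> reads a
 // single element from each input, so it is decomposed into two splat
 // shuffles (broadcasts) that are then blended/unpacked together.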
14965 auto DoBothBroadcast = [&] {
14966 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14967 for (int M : Mask)
14968 if (M >= Size) {
14969 if (V2BroadcastIdx < 0)
14970 V2BroadcastIdx = M - Size;
14971 else if (M - Size != V2BroadcastIdx)
14972 return false;
14973 } else if (M >= 0) {
14974 if (V1BroadcastIdx < 0)
14975 V1BroadcastIdx = M;
14976 else if (M != V1BroadcastIdx)
14977 return false;
14978 }
14979 return true;
14980 };
14981 if (DoBothBroadcast())
14982 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
14983 Subtarget, DAG);
14984
14985 // If the inputs all stem from a single 128-bit lane of each input, then we
14986 // split them rather than blending because the split will decompose to
14987 // unusually few instructions.
14988 int LaneCount = VT.getSizeInBits() / 128;
14989 int LaneSize = Size / LaneCount;
14990 SmallBitVector LaneInputs[2];
14991 LaneInputs[0].resize(LaneCount, false);
14992 LaneInputs[1].resize(LaneCount, false);
14993 for (int i = 0; i < Size; ++i)
14994 if (Mask[i] >= 0)
14995 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14996 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14997 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14998 /*SimpleOnly*/ false);
14999
15000 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15001 // requires that the decomposed single-input shuffles don't end up here.
15002 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15003 Subtarget, DAG);
15004}
15005
15006// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15007// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15008static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15009 SDValue V1, SDValue V2,
15010 ArrayRef<int> Mask,
15011 SelectionDAG &DAG) {
15012 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15013
15014 int LHSMask[4] = {-1, -1, -1, -1};
15015 int RHSMask[4] = {-1, -1, -1, -1};
15016 int SHUFPDMask[4] = {-1, -1, -1, -1};
15017
15018 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15019 // perform the shuffle once the lanes have been shuffled in place.
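 // For example (illustrative): the v4f64 mask <3,7,0,4> splits into the lane
 // shuffles <u,3,0,u> and <u,7,4,u> (each lowerable as a VPERM2F128 of
 // V1/V2), combined by SHUFPD with immediate 0b0011.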
15020 for (int i = 0; i != 4; ++i) {
15021 int M = Mask[i];
15022 if (M < 0)
15023 continue;
15024 int LaneBase = i & ~1;
15025 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15026 LaneMask[LaneBase + (M & 1)] = M;
15027 SHUFPDMask[i] = M & 1;
15028 }
15029
15030 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15031 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15032 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15033 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15034}
15035
15036/// Lower a vector shuffle crossing multiple 128-bit lanes as
15037/// a lane permutation followed by a per-lane permutation.
15038///
15039/// This is mainly for cases where we can have non-repeating permutes
15040/// in each lane.
15041///
15042/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15043/// we should investigate merging them.
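/// For example (illustrative): the single-input v8f32 mask <5,4,7,6,1,0,3,2>
/// becomes the cross-lane permute <4,5,6,7,0,1,2,3> (a lane swap) followed by
/// the in-lane permute <1,0,3,2,5,4,7,6>.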
15044static SDValue lowerShuffleAsLanePermuteAndPermute(
15045 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15046 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15047 int NumElts = VT.getVectorNumElements();
15048 int NumLanes = VT.getSizeInBits() / 128;
15049 int NumEltsPerLane = NumElts / NumLanes;
15050 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15051
15052 /// Attempts to find a sublane permute with the given size
15053 /// that gets all elements into their target lanes.
15054 ///
15055 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
15056 /// If unsuccessful, returns false and may overwrite InLaneMask.
15057 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15058 int NumSublanesPerLane = NumSublanes / NumLanes;
15059 int NumEltsPerSublane = NumElts / NumSublanes;
15060
15061 SmallVector<int, 16> CrossLaneMask;
15062 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15063 // CrossLaneMask but one entry == one sublane.
15064 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15065 APInt DemandedCrossLane = APInt::getZero(NumElts);
15066
15067 for (int i = 0; i != NumElts; ++i) {
15068 int M = Mask[i];
15069 if (M < 0)
15070 continue;
15071
15072 int SrcSublane = M / NumEltsPerSublane;
15073 int DstLane = i / NumEltsPerLane;
15074
15075 // We only need to get the elements into the right lane, not sublane.
15076 // So search all sublanes that make up the destination lane.
15077 bool Found = false;
15078 int DstSubStart = DstLane * NumSublanesPerLane;
15079 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15080 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15081 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15082 continue;
15083
15084 Found = true;
15085 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15086 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15087 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15088 DemandedCrossLane.setBit(InLaneMask[i]);
15089 break;
15090 }
15091 if (!Found)
15092 return SDValue();
15093 }
15094
15095 // Fill CrossLaneMask using CrossLaneMaskLarge.
15096 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15097
15098 if (!CanUseSublanes) {
15099 // If we're only shuffling a single lowest lane and the rest are identity
15100 // then don't bother.
15101 // TODO - isShuffleMaskInputInPlace could be extended to something like
15102 // this.
15103 int NumIdentityLanes = 0;
15104 bool OnlyShuffleLowestLane = true;
15105 for (int i = 0; i != NumLanes; ++i) {
15106 int LaneOffset = i * NumEltsPerLane;
15107 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15108 i * NumEltsPerLane))
15109 NumIdentityLanes++;
15110 else if (CrossLaneMask[LaneOffset] != 0)
15111 OnlyShuffleLowestLane = false;
15112 }
15113 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15114 return SDValue();
15115 }
15116
15117 // Avoid returning the same shuffle operation. For example,
15118 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15119 // undef:v16i16
15120 if (CrossLaneMask == Mask || InLaneMask == Mask)
15121 return SDValue();
15122
15123 // Simplify CrossLaneMask based on the actual demanded elements.
15124 if (V1.hasOneUse())
15125 for (int i = 0; i != NumElts; ++i)
15126 if (!DemandedCrossLane[i])
15127 CrossLaneMask[i] = SM_SentinelUndef;
15128
15129 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15130 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15131 InLaneMask);
15132 };
15133
15134 // First attempt a solution with full lanes.
15135 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15136 return V;
15137
15138 // The rest of the solutions use sublanes.
15139 if (!CanUseSublanes)
15140 return SDValue();
15141
15142 // Then attempt a solution with 64-bit sublanes (vpermq).
15143 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15144 return V;
15145
15146 // If that doesn't work and we have fast variable cross-lane shuffle,
15147 // attempt 32-bit sublanes (vpermd).
15148 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15149 return SDValue();
15150
15151 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15152}
15153
15154/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15155static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15156 SmallVector<int> &InLaneMask) {
15157 int Size = Mask.size();
15158 InLaneMask.assign(Mask.begin(), Mask.end());
15159 for (int i = 0; i < Size; ++i) {
15160 int &M = InLaneMask[i];
15161 if (M < 0)
15162 continue;
15163 if (((M % Size) / LaneSize) != (i / LaneSize))
15164 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15165 }
15166}
15167
15168/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15169/// source with a lane permutation.
15170///
15171/// This lowering strategy results in four instructions in the worst case for a
15172/// single-input cross lane shuffle which is lower than any other fully general
15173/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15174/// shuffle pattern should be handled prior to trying this lowering.
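/// For example (illustrative): the single-input v8f32 reversal mask
/// <7,6,5,4,3,2,1,0> is lowered by swapping the 128-bit lanes with a v4f64
/// <2,3,0,1> shuffle and then reversing the elements within each lane.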
15175static SDValue lowerShuffleAsLanePermuteAndShuffle(
15176 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15177 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15178 // FIXME: This should probably be generalized for 512-bit vectors as well.
15179 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15180 int Size = Mask.size();
15181 int LaneSize = Size / 2;
15182
15183 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15184 // Only do this if the elements aren't all from the lower lane,
15185 // otherwise we're (probably) better off doing a split.
15186 if (VT == MVT::v4f64 &&
15187 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15188 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15189
15190 // If there are only inputs from one 128-bit lane, splitting will in fact be
15191 // less expensive. The flags track whether the given lane contains an element
15192 // that crosses to another lane.
15193 bool AllLanes;
15194 if (!Subtarget.hasAVX2()) {
15195 bool LaneCrossing[2] = {false, false};
15196 for (int i = 0; i < Size; ++i)
15197 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15198 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15199 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15200 } else {
15201 bool LaneUsed[2] = {false, false};
15202 for (int i = 0; i < Size; ++i)
15203 if (Mask[i] >= 0)
15204 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15205 AllLanes = LaneUsed[0] && LaneUsed[1];
15206 }
15207
15208 // TODO - we could support shuffling V2 in the Flipped input.
15209 assert(V2.isUndef() &&
15210 "This last part of this routine only works on single input shuffles");
15211
15212 SmallVector<int> InLaneMask;
15213 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15214
15215 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15216 "In-lane shuffle mask expected");
15217
15218 // If we're not using both lanes in each lane and the inlane mask is not
15219 // repeating, then we're better off splitting.
15220 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15221 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15222 /*SimpleOnly*/ false);
15223
15224 // Flip the lanes, and shuffle the results which should now be in-lane.
15225 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15226 SDValue Flipped = DAG.getBitcast(PVT, V1);
15227 Flipped =
15228 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15229 Flipped = DAG.getBitcast(VT, Flipped);
15230 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15231}
15232
15233/// Handle lowering 2-lane 128-bit shuffles.
15234static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15235 SDValue V2, ArrayRef<int> Mask,
15236 const APInt &Zeroable,
15237 const X86Subtarget &Subtarget,
15238 SelectionDAG &DAG) {
15239 if (V2.isUndef()) {
15240 // Attempt to match VBROADCAST*128 subvector broadcast load.
15241 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15242 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15243 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15244 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15245 MVT MemVT = VT.getHalfNumVectorElementsVT();
15246 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15247 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15248 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15249 VT, MemVT, Ld, Ofs, DAG))
15250 return BcstLd;
15251 }
15252
15253 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15254 if (Subtarget.hasAVX2())
15255 return SDValue();
15256 }
15257
15258 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15259
15260 SmallVector<int, 4> WidenedMask;
15261 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15262 return SDValue();
15263
15264 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15265 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15266
15267 // Try to use an insert into a zero vector.
15268 if (WidenedMask[0] == 0 && IsHighZero) {
15269 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15270 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15271 DAG.getVectorIdxConstant(0, DL));
15272 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15273 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15274 DAG.getVectorIdxConstant(0, DL));
15275 }
15276
15277 // TODO: If minimizing size and one of the inputs is a zero vector and
15278 // the zero vector has only one use, we could use a VPERM2X128 to save the
15279 // instruction bytes needed to explicitly generate the zero vector.
15280
15281 // Blends are faster and handle all the non-lane-crossing cases.
15282 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15283 Subtarget, DAG))
15284 return Blend;
15285
15286 // If either input operand is a zero vector, use VPERM2X128 because its mask
15287 // allows us to replace the zero input with an implicit zero.
15288 if (!IsLowZero && !IsHighZero) {
15289 // Check for patterns which can be matched with a single insert of a 128-bit
15290 // subvector.
15291 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15292 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15293
15294 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15295 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15296 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15297 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15298 SDValue SubVec =
15299 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15300 DAG.getVectorIdxConstant(0, DL));
15301 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15302 DAG.getVectorIdxConstant(2, DL));
15303 }
15304 }
15305
15306 // Try to use SHUF128 if possible.
15307 if (Subtarget.hasVLX()) {
15308 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15309 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15310 ((WidenedMask[1] % 2) << 1);
15311 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15312 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15313 }
15314 }
15315 }
15316
15317 // Otherwise form a 128-bit permutation. After accounting for undefs,
15318 // convert the 64-bit shuffle mask selection values into 128-bit
15319 // selection bits by dividing the indexes by 2 and shifting into positions
15320 // defined by a vperm2*128 instruction's immediate control byte.
15321
15322 // The immediate permute control byte looks like this:
15323 // [1:0] - select 128 bits from sources for low half of destination
15324 // [2] - ignore
15325 // [3] - zero low half of destination
15326 // [5:4] - select 128 bits from sources for high half of destination
15327 // [6] - ignore
15328 // [7] - zero high half of destination
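 // For example (illustrative): the v4i64 mask <2,3,4,5> widens to <1,2>,
 // giving PermMask = (1 << 0) | (2 << 4) = 0x21, i.e. the upper half of V1
 // followed by the lower half of V2.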
15329
15330 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15331 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15332
15333 unsigned PermMask = 0;
15334 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15335 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15336
15337 // Check the immediate mask and replace unused sources with undef.
15338 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15339 V1 = DAG.getUNDEF(VT);
15340 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15341 V2 = DAG.getUNDEF(VT);
15342
15343 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15344 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15345}
15346
15347/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15348/// shuffling each lane.
15349///
15350/// This attempts to create a repeated lane shuffle where each lane uses one
15351/// or two of the lanes of the inputs. The lanes of the input vectors are
15352/// shuffled in one or two independent shuffles to get the lanes into the
15353/// position needed by the final shuffle.
15354static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15355 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15356 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15357 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15358
15359 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15360 return SDValue();
15361
15362 int NumElts = Mask.size();
15363 int NumLanes = VT.getSizeInBits() / 128;
15364 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15365 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15366 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15367
15368 // First pass will try to fill in the RepeatMask from lanes that need two
15369 // sources.
15370 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15371 int Srcs[2] = {-1, -1};
15372 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15373 for (int i = 0; i != NumLaneElts; ++i) {
15374 int M = Mask[(Lane * NumLaneElts) + i];
15375 if (M < 0)
15376 continue;
15377 // Determine which of the possible input lanes (NumLanes from each source)
15378 // this element comes from. Assign that as one of the sources for this
15379 // lane. We can assign up to 2 sources for this lane. If we run out of
15380 // sources we can't do anything.
15381 int LaneSrc = M / NumLaneElts;
15382 int Src;
15383 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15384 Src = 0;
15385 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15386 Src = 1;
15387 else
15388 return SDValue();
15389
15390 Srcs[Src] = LaneSrc;
15391 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15392 }
15393
15394 // If this lane has two sources, see if it fits with the repeat mask so far.
15395 if (Srcs[1] < 0)
15396 continue;
15397
15398 LaneSrcs[Lane][0] = Srcs[0];
15399 LaneSrcs[Lane][1] = Srcs[1];
15400
15401 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15402 assert(M1.size() == M2.size() && "Unexpected mask size");
15403 for (int i = 0, e = M1.size(); i != e; ++i)
15404 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15405 return false;
15406 return true;
15407 };
15408
15409 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15410 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15411 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15412 int M = Mask[i];
15413 if (M < 0)
15414 continue;
15415 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15416 "Unexpected mask element");
15417 MergedMask[i] = M;
15418 }
15419 };
15420
15421 if (MatchMasks(InLaneMask, RepeatMask)) {
15422 // Merge this lane mask into the final repeat mask.
15423 MergeMasks(InLaneMask, RepeatMask);
15424 continue;
15425 }
15426
15427 // Didn't find a match. Swap the operands and try again.
15428 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15430
15431 if (MatchMasks(InLaneMask, RepeatMask)) {
15432 // Merge this lane mask into the final repeat mask.
15433 MergeMasks(InLaneMask, RepeatMask);
15434 continue;
15435 }
15436
15437 // Couldn't find a match with the operands in either order.
15438 return SDValue();
15439 }
15440
15441 // Now handle any lanes with only one source.
15442 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15443 // If this lane has already been processed, skip it.
15444 if (LaneSrcs[Lane][0] >= 0)
15445 continue;
15446
15447 for (int i = 0; i != NumLaneElts; ++i) {
15448 int M = Mask[(Lane * NumLaneElts) + i];
15449 if (M < 0)
15450 continue;
15451
15452 // If RepeatMask isn't defined yet we can define it ourselves.
15453 if (RepeatMask[i] < 0)
15454 RepeatMask[i] = M % NumLaneElts;
15455
15456 if (RepeatMask[i] < NumElts) {
15457 if (RepeatMask[i] != M % NumLaneElts)
15458 return SDValue();
15459 LaneSrcs[Lane][0] = M / NumLaneElts;
15460 } else {
15461 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15462 return SDValue();
15463 LaneSrcs[Lane][1] = M / NumLaneElts;
15464 }
15465 }
15466
15467 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15468 return SDValue();
15469 }
15470
15471 SmallVector<int, 16> NewMask(NumElts, -1);
15472 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15473 int Src = LaneSrcs[Lane][0];
15474 for (int i = 0; i != NumLaneElts; ++i) {
15475 int M = -1;
15476 if (Src >= 0)
15477 M = Src * NumLaneElts + i;
15478 NewMask[Lane * NumLaneElts + i] = M;
15479 }
15480 }
15481 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15482 // Ensure we didn't get back the shuffle we started with.
15483 // FIXME: This is a hack to make up for some splat handling code in
15484 // getVectorShuffle.
15485 if (isa<ShuffleVectorSDNode>(NewV1) &&
15486 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15487 return SDValue();
15488
15489 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15490 int Src = LaneSrcs[Lane][1];
15491 for (int i = 0; i != NumLaneElts; ++i) {
15492 int M = -1;
15493 if (Src >= 0)
15494 M = Src * NumLaneElts + i;
15495 NewMask[Lane * NumLaneElts + i] = M;
15496 }
15497 }
15498 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15499 // Ensure we didn't get back the shuffle we started with.
15500 // FIXME: This is a hack to make up for some splat handling code in
15501 // getVectorShuffle.
15502 if (isa<ShuffleVectorSDNode>(NewV2) &&
15503 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15504 return SDValue();
15505
15506 for (int i = 0; i != NumElts; ++i) {
15507 if (Mask[i] < 0) {
15508 NewMask[i] = -1;
15509 continue;
15510 }
15511 NewMask[i] = RepeatMask[i % NumLaneElts];
15512 if (NewMask[i] < 0)
15513 continue;
15514
15515 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15516 }
15517 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15518}
15519
15520/// If the input shuffle mask results in a vector that is undefined in all upper
15521/// or lower half elements and that mask accesses only 2 halves of the
15522/// shuffle's operands, return true. A mask of half the width with mask indexes
15523/// adjusted to access the extracted halves of the original shuffle operands is
15524/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15525/// lower half of each input operand is accessed.
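/// For example (illustrative): the v8i32 mask <u,u,u,u,0,1,12,13> has an
/// undef lower half and produces HalfMask = <0,1,4,5> with HalfIdx1 = 0
/// (lower half of V1) and HalfIdx2 = 3 (upper half of V2).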
15526static bool
15527getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15528 int &HalfIdx1, int &HalfIdx2) {
15529 assert((Mask.size() == HalfMask.size() * 2) &&
15530 "Expected input mask to be twice as long as output");
15531
15532 // Exactly one half of the result must be undef to allow narrowing.
15533 bool UndefLower = isUndefLowerHalf(Mask);
15534 bool UndefUpper = isUndefUpperHalf(Mask);
15535 if (UndefLower == UndefUpper)
15536 return false;
15537
15538 unsigned HalfNumElts = HalfMask.size();
15539 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15540 HalfIdx1 = -1;
15541 HalfIdx2 = -1;
15542 for (unsigned i = 0; i != HalfNumElts; ++i) {
15543 int M = Mask[i + MaskIndexOffset];
15544 if (M < 0) {
15545 HalfMask[i] = M;
15546 continue;
15547 }
15548
15549 // Determine which of the 4 half vectors this element is from.
15550 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15551 int HalfIdx = M / HalfNumElts;
15552
15553 // Determine the element index into its half vector source.
15554 int HalfElt = M % HalfNumElts;
15555
15556 // We can shuffle with up to 2 half vectors, set the new 'half'
15557 // shuffle mask accordingly.
15558 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15559 HalfMask[i] = HalfElt;
15560 HalfIdx1 = HalfIdx;
15561 continue;
15562 }
15563 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15564 HalfMask[i] = HalfElt + HalfNumElts;
15565 HalfIdx2 = HalfIdx;
15566 continue;
15567 }
15568
15569 // Too many half vectors referenced.
15570 return false;
15571 }
15572
15573 return true;
15574}
15575
15576/// Given the output values from getHalfShuffleMask(), create a half width
15577/// shuffle of extracted vectors followed by an insert back to full width.
15578static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15579 ArrayRef<int> HalfMask, int HalfIdx1,
15580 int HalfIdx2, bool UndefLower,
15581 SelectionDAG &DAG, bool UseConcat = false) {
15582 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15583 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15584
15585 MVT VT = V1.getSimpleValueType();
15586 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15587 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15588
15589 auto getHalfVector = [&](int HalfIdx) {
15590 if (HalfIdx < 0)
15591 return DAG.getUNDEF(HalfVT);
15592 SDValue V = (HalfIdx < 2 ? V1 : V2);
15593 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15594 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15595 DAG.getVectorIdxConstant(HalfIdx, DL));
15596 };
15597
15598 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15599 SDValue Half1 = getHalfVector(HalfIdx1);
15600 SDValue Half2 = getHalfVector(HalfIdx2);
15601 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15602 if (UseConcat) {
15603 SDValue Op0 = V;
15604 SDValue Op1 = DAG.getUNDEF(HalfVT);
15605 if (UndefLower)
15606 std::swap(Op0, Op1);
15607 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15608 }
15609
15610 unsigned Offset = UndefLower ? HalfNumElts : 0;
15611 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15612 DAG.getVectorIdxConstant(Offset, DL));
15613}
15614
15615/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15616/// This allows for fast cases such as subvector extraction/insertion
15617/// or shuffling smaller vector types which can lower more efficiently.
15618static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15619 SDValue V2, ArrayRef<int> Mask,
15620 const X86Subtarget &Subtarget,
15621 SelectionDAG &DAG) {
15622 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15623 "Expected 256-bit or 512-bit vector");
15624
15625 bool UndefLower = isUndefLowerHalf(Mask);
15626 if (!UndefLower && !isUndefUpperHalf(Mask))
15627 return SDValue();
15628
15629 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15630 "Completely undef shuffle mask should have been simplified already");
15631
15632 // Upper half is undef and lower half is whole upper subvector.
15633 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15634 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15635 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15636 if (!UndefLower &&
15637 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15638 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15639 DAG.getVectorIdxConstant(HalfNumElts, DL));
15640 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15641 DAG.getVectorIdxConstant(0, DL));
15642 }
15643
15644 // Lower half is undef and upper half is whole lower subvector.
15645 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15646 if (UndefLower &&
15647 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15648 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15649 DAG.getVectorIdxConstant(0, DL));
15650 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15651 DAG.getVectorIdxConstant(HalfNumElts, DL));
15652 }
15653
15654 int HalfIdx1, HalfIdx2;
15655 SmallVector<int, 8> HalfMask(HalfNumElts);
15656 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15657 return SDValue();
15658
15659 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15660
15661 // Only shuffle the halves of the inputs when useful.
15662 unsigned NumLowerHalves =
15663 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15664 unsigned NumUpperHalves =
15665 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15666 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15667
15668 // Determine the larger pattern of undef/halves, then decide if it's worth
15669 // splitting the shuffle based on subtarget capabilities and types.
15670 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15671 if (!UndefLower) {
15672 // XXXXuuuu: no insert is needed.
15673 // Always extract lowers when setting lower - these are all free subreg ops.
15674 if (NumUpperHalves == 0)
15675 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15676 UndefLower, DAG);
15677
15678 if (NumUpperHalves == 1) {
15679 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15680 if (Subtarget.hasAVX2()) {
15681 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15682 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15683 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15684 (!isSingleSHUFPSMask(HalfMask) ||
15685 Subtarget.hasFastVariableCrossLaneShuffle()))
15686 return SDValue();
15687 // If this is a unary shuffle (assume that the 2nd operand is
15688 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15689 // are better off extracting the upper half of 1 operand and using a
15690 // narrow shuffle.
15691 if (EltWidth == 64 && V2.isUndef())
15692 return SDValue();
15693 // If this is a unary vXi8 shuffle with in-place halves, then perform a
15694 // full width pshufb, and then merge.
15695 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
15696 return SDValue();
15697 }
15698 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15699 if (Subtarget.hasAVX512() && VT.is512BitVector())
15700 return SDValue();
15701 // Extract + narrow shuffle is better than the wide alternative.
15702 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15703 UndefLower, DAG);
15704 }
15705
15706 // Don't extract both uppers, instead shuffle and then extract.
15707 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15708 return SDValue();
15709 }
15710
15711 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15712 if (NumUpperHalves == 0) {
15713 // AVX2 has efficient 64-bit element cross-lane shuffles.
15714 // TODO: Refine to account for unary shuffle, splat, and other masks?
15715 if (Subtarget.hasAVX2() && EltWidth == 64)
15716 return SDValue();
15717 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15718 if (Subtarget.hasAVX512() && VT.is512BitVector())
15719 return SDValue();
15720 // Narrow shuffle + insert is better than the wide alternative.
15721 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15722 UndefLower, DAG);
15723 }
15724
15725 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15726 return SDValue();
15727}
15728
15729/// Handle case where shuffle sources are coming from the same 128-bit lane and
15730/// every lane can be represented as the same repeating mask - allowing us to
15731/// shuffle the sources with the repeating shuffle and then permute the result
15732/// to the destination lanes.
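/// For example (illustrative): the single-input v8f32 mask <1,0,3,2,1,0,3,2>
/// only reads the low 128-bit lane, so it is lowered as an in-lane shuffle
/// that swaps adjacent pairs in the low lane followed by the lane permute
/// <0,1,2,3,0,1,2,3> that copies the low lane to both destination lanes.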
15733static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15734 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15735 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15736 int NumElts = VT.getVectorNumElements();
15737 int NumLanes = VT.getSizeInBits() / 128;
15738 int NumLaneElts = NumElts / NumLanes;
15739
15740 // On AVX2 we may be able to just shuffle the lowest elements and then
15741 // broadcast the result.
15742 if (Subtarget.hasAVX2()) {
15743 for (unsigned BroadcastSize : {16, 32, 64}) {
15744 if (BroadcastSize <= VT.getScalarSizeInBits())
15745 continue;
15746 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15747
15748 // Attempt to match a repeating pattern every NumBroadcastElts,
15749 // accounting for UNDEFs but only referencing the lowest 128-bit
15750 // lane of the inputs.
15751 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15752 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15753 for (int j = 0; j != NumBroadcastElts; ++j) {
15754 int M = Mask[i + j];
15755 if (M < 0)
15756 continue;
15757 int &R = RepeatMask[j];
15758 if (0 != ((M % NumElts) / NumLaneElts))
15759 return false;
15760 if (0 <= R && R != M)
15761 return false;
15762 R = M;
15763 }
15764 return true;
15765 };
15766
15767 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15768 if (!FindRepeatingBroadcastMask(RepeatMask))
15769 continue;
15770
15771 // Shuffle the (lowest) repeated elements in place for broadcast.
15772 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15773
15774 // Shuffle the actual broadcast.
15775 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15776 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15777 for (int j = 0; j != NumBroadcastElts; ++j)
15778 BroadcastMask[i + j] = j;
15779
15780 // Avoid returning the same shuffle operation. For example,
15781 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15782 if (BroadcastMask == Mask)
15783 return SDValue();
15784
15785 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15786 BroadcastMask);
15787 }
15788 }
15789
15790 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15791 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15792 return SDValue();
15793
15794 // Bail if we already have a repeated lane shuffle mask.
15795 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15796 return SDValue();
15797
15798 // Helper to look for repeated mask in each split sublane, and that those
15799 // sublanes can then be permuted into place.
15800 auto ShuffleSubLanes = [&](int SubLaneScale) {
15801 int NumSubLanes = NumLanes * SubLaneScale;
15802 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15803
15804 // Check that all the sources are coming from the same lane and see if we
15805 // can form a repeating shuffle mask (local to each sub-lane). At the same
15806 // time, determine the source sub-lane for each destination sub-lane.
15807 int TopSrcSubLane = -1;
15808 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15809 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15810 SubLaneScale,
15811 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15812
15813 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15814 // Extract the sub-lane mask, check that it all comes from the same lane
15815 // and normalize the mask entries to come from the first lane.
15816 int SrcLane = -1;
15817 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15818 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15819 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15820 if (M < 0)
15821 continue;
15822 int Lane = (M % NumElts) / NumLaneElts;
15823 if ((0 <= SrcLane) && (SrcLane != Lane))
15824 return SDValue();
15825 SrcLane = Lane;
15826 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15827 SubLaneMask[Elt] = LocalM;
15828 }
15829
15830 // Whole sub-lane is UNDEF.
15831 if (SrcLane < 0)
15832 continue;
15833
15834 // Attempt to match against the candidate repeated sub-lane masks.
15835 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15836 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15837 for (int i = 0; i != NumSubLaneElts; ++i) {
15838 if (M1[i] < 0 || M2[i] < 0)
15839 continue;
15840 if (M1[i] != M2[i])
15841 return false;
15842 }
15843 return true;
15844 };
15845
15846 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15847 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15848 continue;
15849
15850 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15851 for (int i = 0; i != NumSubLaneElts; ++i) {
15852 int M = SubLaneMask[i];
15853 if (M < 0)
15854 continue;
15855 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15856 "Unexpected mask element");
15857 RepeatedSubLaneMask[i] = M;
15858 }
15859
15860 // Track the top most source sub-lane - by setting the remaining to
15861 // UNDEF we can greatly simplify shuffle matching.
15862 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15863 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15864 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15865 break;
15866 }
15867
15868 // Bail if we failed to find a matching repeated sub-lane mask.
15869 if (Dst2SrcSubLanes[DstSubLane] < 0)
15870 return SDValue();
15871 }
15872 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15873 "Unexpected source lane");
15874
15875 // Create a repeating shuffle mask for the entire vector.
15876 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15877 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15878 int Lane = SubLane / SubLaneScale;
15879 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15880 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15881 int M = RepeatedSubLaneMask[Elt];
15882 if (M < 0)
15883 continue;
15884 int Idx = (SubLane * NumSubLaneElts) + Elt;
15885 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15886 }
15887 }
15888
15889 // Shuffle each source sub-lane to its destination.
15890 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15891 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15892 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15893 if (SrcSubLane < 0)
15894 continue;
15895 for (int j = 0; j != NumSubLaneElts; ++j)
15896 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15897 }
15898
15899 // Avoid returning the same shuffle operation.
15900 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15901 if (RepeatedMask == Mask || SubLaneMask == Mask)
15902 return SDValue();
15903
15904 SDValue RepeatedShuffle =
15905 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15906
15907 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15908 SubLaneMask);
15909 };
15910
15911 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15912 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15913 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15914 // Otherwise we can only permute whole 128-bit lanes.
15915 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15916 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15917 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15918 MinSubLaneScale = 2;
15919 MaxSubLaneScale =
15920 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15921 }
15922 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15923 MinSubLaneScale = MaxSubLaneScale = 4;
15924
15925 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15926 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15927 return Shuffle;
15928
15929 return SDValue();
15930}
15931
15932static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15933 bool &ForceV1Zero, bool &ForceV2Zero,
15934 unsigned &ShuffleImm, ArrayRef<int> Mask,
15935 const APInt &Zeroable) {
15936 int NumElts = VT.getVectorNumElements();
15937 assert(VT.getScalarSizeInBits() == 64 &&
15938 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15939 "Unexpected data type for VSHUFPD");
15940 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15941 "Illegal shuffle mask");
15942
15943 bool ZeroLane[2] = { true, true };
15944 for (int i = 0; i < NumElts; ++i)
15945 ZeroLane[i & 1] &= Zeroable[i];
15946
15947 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15948 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
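 // For example (illustrative): for V4F64 the mask <0,5,2,7> takes even result
 // elements from V1 and odd result elements from V2, giving SHUFPDMask
 // <0,1,0,1> and (assuming the usual bit-per-element encoding) the
 // immediate 0b1010.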
15949 bool IsSHUFPD = true;
15950 bool IsCommutable = true;
15951 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
15952 for (int i = 0; i < NumElts; ++i) {
15953 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15954 continue;
15955 if (Mask[i] < 0)
15956 return false;
15957 int Val = (i & 6) + NumElts * (i & 1);
15958 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15959 if (Mask[i] < Val || Mask[i] > Val + 1)
15960 IsSHUFPD = false;
15961 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15962 IsCommutable = false;
15963 SHUFPDMask[i] = Mask[i] % 2;
15964 }
15965
15966 if (!IsSHUFPD && !IsCommutable)
15967 return false;
15968
15969 if (!IsSHUFPD && IsCommutable)
15970 std::swap(V1, V2);
15971
15972 ForceV1Zero = ZeroLane[0];
15973 ForceV2Zero = ZeroLane[1];
15974 ShuffleImm = getSHUFPDImm(SHUFPDMask);
15975 return true;
15976}
15977
15978static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15979 SDValue V2, ArrayRef<int> Mask,
15980 const APInt &Zeroable,
15981 const X86Subtarget &Subtarget,
15982 SelectionDAG &DAG) {
15983 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15984 "Unexpected data type for VSHUFPD");
15985
15986 unsigned Immediate = 0;
15987 bool ForceV1Zero = false, ForceV2Zero = false;
15988 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15989 Mask, Zeroable))
15990 return SDValue();
15991
15992 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15993 if (ForceV1Zero)
15994 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15995 if (ForceV2Zero)
15996 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15997
15998 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15999 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16000}
16001
16002// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16003// by zeroable elements in the remaining 24 elements. Turn this into two
16004// vmovqb instructions shuffled together.
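// Illustrative sketch of the byte layout: each VTRUNC below keeps bytes
// {0,8,16,24} of its 256-bit input in the low 4 bytes of a v16i8 and zeroes
// the rest, so the unpckldq of the two results places the wanted 8 bytes
// {0,8,16,24,32,40,48,56} at the front, followed by zeros.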
16005static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16006 SDValue V1, SDValue V2,
16007 ArrayRef<int> Mask,
16008 const APInt &Zeroable,
16009 SelectionDAG &DAG) {
16010 assert(VT == MVT::v32i8 && "Unexpected type!");
16011
16012 // The first 8 indices should be every 8th element.
16013 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16014 return SDValue();
16015
16016 // Remaining elements need to be zeroable.
16017 if (Zeroable.countl_one() < (Mask.size() - 8))
16018 return SDValue();
16019
16020 V1 = DAG.getBitcast(MVT::v4i64, V1);
16021 V2 = DAG.getBitcast(MVT::v4i64, V2);
16022
16023 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16024 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16025
16026 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16027 // the upper bits of the result using an unpckldq.
16028 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16029 { 0, 1, 2, 3, 16, 17, 18, 19,
16030 4, 5, 6, 7, 20, 21, 22, 23 });
16031 // Insert the unpckldq into a zero vector to widen to v32i8.
16032 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16033 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16034 DAG.getVectorIdxConstant(0, DL));
16035}
16036
16037// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16038// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16039// =>
16040// ul = unpckl v1, v2
16041// uh = unpckh v1, v2
16042// a = vperm ul, uh
16043// b = vperm ul, uh
16044//
16045// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16046// and permute. We cannot directly match v3 because it is split into two
16047// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16048// pair of 256-bit shuffles and makes sure the masks are consecutive.
16049//
16050// Once unpck and permute nodes are created, the permute corresponding to this
16051// shuffle is returned, while the other permute replaces the other half of the
16052// shuffle in the selection dag.
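// For example (illustrative), for v8i32 with V1 = <a0..a7>, V2 = <b0..b7>:
//   mask1 = <0,8,1,9,2,10,3,11>, mask2 = <4,12,5,13,6,14,7,15>
//   unpckl = <a0,b0,a1,b1,a4,b4,a5,b5>, unpckh = <a2,b2,a3,b3,a6,b6,a7,b7>
//   vperm2x128(unpckl, unpckh, 0x20) = <a0,b0,a1,b1,a2,b2,a3,b3>  (= a)
//   vperm2x128(unpckl, unpckh, 0x31) = <a4,b4,a5,b5,a6,b6,a7,b7>  (= b)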
16053static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16054 SDValue V1, SDValue V2,
16055 ArrayRef<int> Mask,
16056 SelectionDAG &DAG) {
16057 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16058 VT != MVT::v32i8)
16059 return SDValue();
16060 // <B0, B1, B0+1, B1+1, ..., >
16061 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16062 unsigned Begin1) {
16063 size_t Size = Mask.size();
16064 assert(Size % 2 == 0 && "Expected even mask size");
16065 for (unsigned I = 0; I < Size; I += 2) {
16066 if (Mask[I] != (int)(Begin0 + I / 2) ||
16067 Mask[I + 1] != (int)(Begin1 + I / 2))
16068 return false;
16069 }
16070 return true;
16071 };
16072 // Check which half is this shuffle node
16073 int NumElts = VT.getVectorNumElements();
16074 size_t FirstQtr = NumElts / 2;
16075 size_t ThirdQtr = NumElts + NumElts / 2;
16076 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16077 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16078 if (!IsFirstHalf && !IsSecondHalf)
16079 return SDValue();
16080
16081 // Find the intersection between shuffle users of V1 and V2.
16082 SmallVector<SDNode *, 2> Shuffles;
16083 for (SDNode *User : V1->users())
16084 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16085 User->getOperand(1) == V2)
16086 Shuffles.push_back(User);
16087 // Limit user size to two for now.
16088 if (Shuffles.size() != 2)
16089 return SDValue();
16090 // Find out which half of the 512-bit shuffles is each smaller shuffle
16091 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16092 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16093 SDNode *FirstHalf;
16094 SDNode *SecondHalf;
16095 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16096 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16097 FirstHalf = Shuffles[0];
16098 SecondHalf = Shuffles[1];
16099 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16100 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16101 FirstHalf = Shuffles[1];
16102 SecondHalf = Shuffles[0];
16103 } else {
16104 return SDValue();
16105 }
16106 // Lower into unpck and perm. Return the perm of this shuffle and replace
16107 // the other.
16108 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16109 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16110 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16111 DAG.getTargetConstant(0x20, DL, MVT::i8));
16112 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16113 DAG.getTargetConstant(0x31, DL, MVT::i8));
16114 if (IsFirstHalf) {
16115 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16116 return Perm1;
16117 }
16118 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16119 return Perm2;
16120}
16121
16122/// Handle lowering of 4-lane 64-bit floating point shuffles.
16123///
16124/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16125/// isn't available.
16126static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16127 const APInt &Zeroable, SDValue V1, SDValue V2,
16128 const X86Subtarget &Subtarget,
16129 SelectionDAG &DAG) {
16130 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16131 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16132 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16133
16134 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16135 Subtarget, DAG))
16136 return V;
16137
16138 if (V2.isUndef()) {
16139 // Check for being able to broadcast a single element.
16140 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16141 Mask, Subtarget, DAG))
16142 return Broadcast;
16143
16144 // Use low duplicate instructions for masks that match their pattern.
16145 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16146 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16147
16148 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16149 // Non-half-crossing single input shuffles can be lowered with an
16150 // interleaved permutation.
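 // For example (illustrative), the mask <1,0,3,2> sets bits 0 and 2 below,
 // giving the VPERMILPD immediate 0b0101, which swaps the two doubles within
 // each 128-bit lane.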
16151 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16152 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16153 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16154 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16155 }
16156
16157 // With AVX2 we have direct support for this permutation.
16158 if (Subtarget.hasAVX2())
16159 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16160 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16161
16162 // Try to create an in-lane repeating shuffle mask and then shuffle the
16163 // results into the target lanes.
16164 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16165 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16166 return V;
16167
16168 // Try to permute the lanes and then use a per-lane permute.
16169 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16170 Mask, DAG, Subtarget))
16171 return V;
16172
16173 // Otherwise, fall back.
16174 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16175 DAG, Subtarget);
16176 }
16177
16178 // Use dedicated unpack instructions for masks that match their pattern.
16179 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16180 return V;
16181
16182 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16183 Zeroable, Subtarget, DAG))
16184 return Blend;
16185
16186 // Check if the blend happens to exactly fit that of SHUFPD.
16187 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16188 Zeroable, Subtarget, DAG))
16189 return Op;
16190
16191 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16192 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16193
16194 // If we have lane crossing shuffles AND they don't all come from the lower
16195 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16196 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16197 // canonicalize to a blend of splat which isn't necessary for this combine.
16198 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16199 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16200 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16201 (V2.getOpcode() != ISD::BUILD_VECTOR))
16202 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16203
16204 // If we have one input in place, then we can permute the other input and
16205 // blend the result.
16206 if (V1IsInPlace || V2IsInPlace)
16207 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16208 Zeroable, Subtarget, DAG);
16209
16210 // Try to create an in-lane repeating shuffle mask and then shuffle the
16211 // results into the target lanes.
16212 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16213 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16214 return V;
16215
16216 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16217 // shuffle. However, if we have AVX2 and either input is already in place,
16218 // we will be able to shuffle the other input even across lanes in a single
16219 // instruction, so skip this pattern.
16220 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16221 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16222 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16223 return V;
16224
16225 // If we have VLX support, we can use VEXPAND.
16226 if (Subtarget.hasVLX())
16227 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16228 Zeroable, Subtarget, DAG))
16229 return V;
16230
16231 // If we have AVX2 then we always want to lower with a blend because at v4 we
16232 // can fully permute the elements.
16233 if (Subtarget.hasAVX2())
16234 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16235 Zeroable, Subtarget, DAG);
16236
16237 // Otherwise fall back on generic lowering.
16238 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16239 Subtarget, DAG);
16240}
16241
16242/// Handle lowering of 4-lane 64-bit integer shuffles.
16243///
16244/// This routine is only called when we have AVX2 and thus a reasonable
16245 /// instruction set for v4i64 shuffling.
16246static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16247 const APInt &Zeroable, SDValue V1, SDValue V2,
16248 const X86Subtarget &Subtarget,
16249 SelectionDAG &DAG) {
16250 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16251 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16252 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16253 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16254
16255 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16256 Subtarget, DAG))
16257 return V;
16258
16259 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16260 Zeroable, Subtarget, DAG))
16261 return Blend;
16262
16263 // Check for being able to broadcast a single element.
16264 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16265 Subtarget, DAG))
16266 return Broadcast;
16267
16268 // Try to use shift instructions if fast.
16269 if (Subtarget.preferLowerShuffleAsShift())
16270 if (SDValue Shift =
16271 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16272 Subtarget, DAG, /*BitwiseOnly*/ true))
16273 return Shift;
16274
16275 if (V2.isUndef()) {
16276 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16277 // can use lower latency instructions that will operate on both lanes.
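 // For example (illustrative), the v4i64 mask <1,0,3,2> repeats <1,0> in each
 // 128-bit lane, which narrows to the PSHUFD mask <2,3,0,1> (immediate 0x4E)
 // and swaps the two quadwords within each lane.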
16278 SmallVector<int, 2> RepeatedMask;
16279 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16280 SmallVector<int, 4> PSHUFDMask;
16281 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16282 return DAG.getBitcast(
16283 MVT::v4i64,
16284 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16285 DAG.getBitcast(MVT::v8i32, V1),
16286 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16287 }
16288
16289 // AVX2 provides a direct instruction for permuting a single input across
16290 // lanes.
16291 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16292 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16293 }
16294
16295 // Try to use shift instructions.
16296 if (SDValue Shift =
16297 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16298 DAG, /*BitwiseOnly*/ false))
16299 return Shift;
16300
16301 // If we have VLX support, we can use VALIGN or VEXPAND.
16302 if (Subtarget.hasVLX()) {
16303 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16304 Zeroable, Subtarget, DAG))
16305 return Rotate;
16306
16307 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16308 Zeroable, Subtarget, DAG))
16309 return V;
16310 }
16311
16312 // Try to use PALIGNR.
16313 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16314 Subtarget, DAG))
16315 return Rotate;
16316
16317 // Use dedicated unpack instructions for masks that match their pattern.
16318 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16319 return V;
16320
16321 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16322 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16323
16324 // If we have one input in place, then we can permute the other input and
16325 // blend the result.
16326 if (V1IsInPlace || V2IsInPlace)
16327 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16328 Zeroable, Subtarget, DAG);
16329
16330 // Try to create an in-lane repeating shuffle mask and then shuffle the
16331 // results into the target lanes.
16332 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16333 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16334 return V;
16335
16336 // Try to lower to PERMQ(BLENDD(V1,V2)).
16337 if (SDValue V =
16338 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16339 return V;
16340
16341 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16343 // shuffle. However, if we have AVX2 and either input is already in place,
16344 // we will be able to shuffle the other input even across lanes in a single
16345 // instruction, so skip this pattern.
16345 if (!V1IsInPlace && !V2IsInPlace)
16346 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16347 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16348 return Result;
16349
16350 // Otherwise fall back on generic blend lowering.
16351 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16352 Zeroable, Subtarget, DAG);
16353}
16354
16355/// Handle lowering of 8-lane 32-bit floating point shuffles.
16356///
16357/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16358/// isn't available.
16359static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16360 const APInt &Zeroable, SDValue V1, SDValue V2,
16361 const X86Subtarget &Subtarget,
16362 SelectionDAG &DAG) {
16363 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16364 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16365 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16366
16367 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16368 Zeroable, Subtarget, DAG))
16369 return Blend;
16370
16371 // Check for being able to broadcast a single element.
16372 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16373 Subtarget, DAG))
16374 return Broadcast;
16375
16376 if (!Subtarget.hasAVX2()) {
16377 SmallVector<int> InLaneMask;
16378 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16379
16380 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16381 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16382 /*SimpleOnly*/ true))
16383 return R;
16384 }
16385 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16386 Zeroable, Subtarget, DAG))
16387 return DAG.getBitcast(MVT::v8f32, ZExt);
16388
16389 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16390 // options to efficiently lower the shuffle.
16391 SmallVector<int, 4> RepeatedMask;
16392 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16393 assert(RepeatedMask.size() == 4 &&
16394 "Repeated masks must be half the mask width!");
16395
16396 // Use even/odd duplicate instructions for masks that match their pattern.
16397 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16398 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16399 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16400 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16401
16402 if (V2.isUndef())
16403 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16404 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16405
16406 // Use dedicated unpack instructions for masks that match their pattern.
16407 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16408 return V;
16409
16410 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16411 // have already handled any direct blends.
16412 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16413 }
16414
16415 // Try to create an in-lane repeating shuffle mask and then shuffle the
16416 // results into the target lanes.
16417 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16418 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16419 return V;
16420
16421 // If we have a single input shuffle with different shuffle patterns in the
16422 // two 128-bit lanes use the variable mask to VPERMILPS.
16423 if (V2.isUndef()) {
16424 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16425 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16426 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16427 }
16428 if (Subtarget.hasAVX2()) {
16429 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16430 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16431 }
16432 // Otherwise, fall back.
16433 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16434 DAG, Subtarget);
16435 }
16436
16437 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16438 // shuffle.
16439 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16440 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16441 return Result;
16442
16443 // If we have VLX support, we can use VEXPAND.
16444 if (Subtarget.hasVLX())
16445 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16446 Zeroable, Subtarget, DAG))
16447 return V;
16448
16449 // Try to match an interleave of two v8f32s and lower them as unpck and
16450 // permutes using ymms. This needs to go before we try to split the vectors.
16451 //
16452 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
16453 // this path inadvertently.
16454 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16455 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16456 Mask, DAG))
16457 return V;
16458
16459 // For non-AVX512, if the mask behaves like 16-bit elements within each
16460 // lane, try to split, since after the split we get more efficient code
16461 // using vpunpcklwd and vpunpckhwd instructions than with vblend.
16462 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16463 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16464 Subtarget, DAG);
16465
16466 // If we have AVX2 then we always want to lower with a blend because at v8 we
16467 // can fully permute the elements.
16468 if (Subtarget.hasAVX2())
16469 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16470 Zeroable, Subtarget, DAG);
16471
16472 // Otherwise fall back on generic lowering.
16473 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16474 Subtarget, DAG);
16475}
16476
16477/// Handle lowering of 8-lane 32-bit integer shuffles.
16478///
16479/// This routine is only called when we have AVX2 and thus a reasonable
16480 /// instruction set for v8i32 shuffling.
16481static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16482 const APInt &Zeroable, SDValue V1, SDValue V2,
16483 const X86Subtarget &Subtarget,
16484 SelectionDAG &DAG) {
16485 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16486 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16487 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16488 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16489
16490 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16491
16492 // Whenever we can lower this as a zext, that instruction is strictly faster
16493 // than any alternative. It also allows us to fold memory operands into the
16494 // shuffle in many cases.
16495 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16496 Zeroable, Subtarget, DAG))
16497 return ZExt;
16498
16499 // Try to match an interleave of two v8i32s and lower them as unpck and
16500 // permutes using ymms. This needs to go before we try to split the vectors.
16501 if (!Subtarget.hasAVX512())
16502 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16503 Mask, DAG))
16504 return V;
16505
16506 // For non-AVX512, if the mask behaves like 16-bit elements within each
16507 // lane, try to split, since after the split we get more efficient code
16508 // than vblend by using vpunpcklwd and vpunpckhwd instructions.
16509 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16510 !Subtarget.hasAVX512())
16511 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16512 Subtarget, DAG);
16513
16514 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16515 Zeroable, Subtarget, DAG))
16516 return Blend;
16517
16518 // Check for being able to broadcast a single element.
16519 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16520 Subtarget, DAG))
16521 return Broadcast;
16522
16523 // Try to use shift instructions if fast.
16524 if (Subtarget.preferLowerShuffleAsShift()) {
16525 if (SDValue Shift =
16526 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16527 Subtarget, DAG, /*BitwiseOnly*/ true))
16528 return Shift;
16529 if (NumV2Elements == 0)
16530 if (SDValue Rotate =
16531 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16532 return Rotate;
16533 }
16534
16535 // If the shuffle mask is repeated in each 128-bit lane we can use more
16536 // efficient instructions that mirror the shuffles across the two 128-bit
16537 // lanes.
16538 SmallVector<int, 4> RepeatedMask;
16539 bool Is128BitLaneRepeatedShuffle =
16540 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16541 if (Is128BitLaneRepeatedShuffle) {
16542 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16543 if (V2.isUndef())
16544 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16545 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16546
16547 // Use dedicated unpack instructions for masks that match their pattern.
16548 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16549 return V;
16550 }
16551
16552 // Try to use shift instructions.
16553 if (SDValue Shift =
16554 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16555 DAG, /*BitwiseOnly*/ false))
16556 return Shift;
16557
16558 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16559 if (SDValue Rotate =
16560 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16561 return Rotate;
16562
16563 // If we have VLX support, we can use VALIGN or EXPAND.
16564 if (Subtarget.hasVLX()) {
16565 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16566 Zeroable, Subtarget, DAG))
16567 return Rotate;
16568
16569 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16570 Zeroable, Subtarget, DAG))
16571 return V;
16572 }
16573
16574 // Try to use byte rotation instructions.
16575 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16576 Subtarget, DAG))
16577 return Rotate;
16578
16579 // Try to create an in-lane repeating shuffle mask and then shuffle the
16580 // results into the target lanes.
16581 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16582 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16583 return V;
16584
16585 if (V2.isUndef()) {
16586 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16587 // because that should be faster than the variable permute alternatives.
16588 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16589 return V;
16590
16591 // If the shuffle patterns aren't repeated but it's a single input, directly
16592 // generate a cross-lane VPERMD instruction.
16593 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16594 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16595 }
16596
16597 // Assume that a single SHUFPS is faster than an alternative sequence of
16598 // multiple instructions (even if the CPU has a domain penalty).
16599 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16600 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16601 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16602 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16603 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16604 CastV1, CastV2, DAG);
16605 return DAG.getBitcast(MVT::v8i32, ShufPS);
16606 }
16607
16608 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16609 // shuffle.
16610 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16611 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16612 return Result;
16613
16614 // Otherwise fall back on generic blend lowering.
16615 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16616 Zeroable, Subtarget, DAG);
16617}
16618
16619/// Handle lowering of 16-lane 16-bit integer shuffles.
16620///
16621/// This routine is only called when we have AVX2 and thus a reasonable
16622 /// instruction set for v16i16 shuffling.
16623static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16624 const APInt &Zeroable, SDValue V1, SDValue V2,
16625 const X86Subtarget &Subtarget,
16626 SelectionDAG &DAG) {
16627 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16628 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16629 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16630 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16631
16632 // Whenever we can lower this as a zext, that instruction is strictly faster
16633 // than any alternative. It also allows us to fold memory operands into the
16634 // shuffle in many cases.
16635 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16636 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16637 return ZExt;
16638
16639 // Check for being able to broadcast a single element.
16640 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16641 Subtarget, DAG))
16642 return Broadcast;
16643
16644 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16645 Zeroable, Subtarget, DAG))
16646 return Blend;
16647
16648 // Use dedicated unpack instructions for masks that match their pattern.
16649 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16650 return V;
16651
16652 // Use dedicated pack instructions for masks that match their pattern.
16653 if (SDValue V =
16654 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16655 return V;
16656
16657 // Try to lower using a truncation.
16658 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16659 Subtarget, DAG))
16660 return V;
16661
16662 // Try to use shift instructions.
16663 if (SDValue Shift =
16664 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16665 Subtarget, DAG, /*BitwiseOnly*/ false))
16666 return Shift;
16667
16668 // Try to use byte rotation instructions.
16669 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16670 Subtarget, DAG))
16671 return Rotate;
16672
16673 // Try to create an in-lane repeating shuffle mask and then shuffle the
16674 // results into the target lanes.
16675 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16676 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16677 return V;
16678
16679 if (V2.isUndef()) {
16680 // Try to use bit rotation instructions.
16681 if (SDValue Rotate =
16682 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16683 return Rotate;
16684
16685 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16686 // because that should be faster than the variable permute alternatives.
16687 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
16688 return V;
16689
16690 // There are no generalized cross-lane shuffle operations available on i16
16691 // element types.
16692 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16693 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16694 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16695 return V;
16696
16697 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16698 DAG, Subtarget);
16699 }
16700
16701 SmallVector<int, 8> RepeatedMask;
16702 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16703 // As this is a single-input shuffle, the repeated mask should be
16704 // a strictly valid v8i16 mask that we can pass through to the v8i16
16705 // lowering to handle even the v16 case.
16706 return lowerV8I16GeneralSingleInputShuffle(
16707 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16708 }
16709 }
16710
16711 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16712 Zeroable, Subtarget, DAG))
16713 return PSHUFB;
16714
16715 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16716 if (Subtarget.hasBWI())
16717 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16718
16719 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16720 // shuffle.
16721 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16722 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16723 return Result;
16724
16725 // Try to permute the lanes and then use a per-lane permute.
16726 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16727 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16728 return V;
16729
16730 // Try to match an interleave of two v16i16s and lower them as unpck and
16731 // permutes using ymms.
16732 if (!Subtarget.hasAVX512())
16733 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16734 Mask, DAG))
16735 return V;
16736
16737 // Otherwise fall back on generic lowering.
16738 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16739 Subtarget, DAG);
16740}
16741
16742/// Handle lowering of 32-lane 8-bit integer shuffles.
16743///
16744/// This routine is only called when we have AVX2 and thus a reasonable
16745 /// instruction set for v32i8 shuffling.
16746static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16747 const APInt &Zeroable, SDValue V1, SDValue V2,
16748 const X86Subtarget &Subtarget,
16749 SelectionDAG &DAG) {
16750 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16751 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16752 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16753 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16754
16755 // Whenever we can lower this as a zext, that instruction is strictly faster
16756 // than any alternative. It also allows us to fold memory operands into the
16757 // shuffle in many cases.
16758 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16759 Zeroable, Subtarget, DAG))
16760 return ZExt;
16761
16762 // Check for being able to broadcast a single element.
16763 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16764 Subtarget, DAG))
16765 return Broadcast;
16766
16767 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16768 Zeroable, Subtarget, DAG))
16769 return Blend;
16770
16771 // Use dedicated unpack instructions for masks that match their pattern.
16772 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
16773 return V;
16774
16775 // Use dedicated pack instructions for masks that match their pattern.
16776 if (SDValue V =
16777 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16778 return V;
16779
16780 // Try to lower using a truncation.
16781 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16782 Subtarget, DAG))
16783 return V;
16784
16785 // Try to use shift instructions.
16786 if (SDValue Shift =
16787 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16788 DAG, /*BitwiseOnly*/ false))
16789 return Shift;
16790
16791 // Try to use byte rotation instructions.
16792 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16793 Subtarget, DAG))
16794 return Rotate;
16795
16796 // Try to use bit rotation instructions.
16797 if (V2.isUndef())
16798 if (SDValue Rotate =
16799 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16800 return Rotate;
16801
16802 // Try to create an in-lane repeating shuffle mask and then shuffle the
16803 // results into the target lanes.
16804 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16805 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16806 return V;
16807
16808 // There are no generalized cross-lane shuffle operations available on i8
16809 // element types.
16810 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16811 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16812 // because that should be faster than the variable permute alternatives.
16813 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
16814 return V;
16815
16816 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16817 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16818 return V;
16819
16820 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16821 DAG, Subtarget);
16822 }
16823
16824 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16825 Zeroable, Subtarget, DAG))
16826 return PSHUFB;
16827
16828 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16829 if (Subtarget.hasVBMI())
16830 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16831
16832 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16833 // shuffle.
16834 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16835 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16836 return Result;
16837
16838 // Try to permute the lanes and then use a per-lane permute.
16839 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16840 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16841 return V;
16842
16843 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16844 // by zeroable elements in the remaining 24 elements. Turn this into two
16845 // vmovqb instructions shuffled together.
16846 if (Subtarget.hasVLX())
16847 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16848 Mask, Zeroable, DAG))
16849 return V;
16850
16851 // Try to match an interleave of two v32i8s and lower them as unpck and
16852 // permutes using ymms.
16853 if (!Subtarget.hasAVX512())
16854 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16855 Mask, DAG))
16856 return V;
16857
16858 // Otherwise fall back on generic lowering.
16859 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16860 Subtarget, DAG);
16861}
16862
16863/// High-level routine to lower various 256-bit x86 vector shuffles.
16864///
16865/// This routine either breaks down the specific type of a 256-bit x86 vector
16866/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16867/// together based on the available instructions.
16868static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16869 SDValue V1, SDValue V2, const APInt &Zeroable,
16870 const X86Subtarget &Subtarget,
16871 SelectionDAG &DAG) {
16872 // If we have a single input to the zero element, insert that into V1 if we
16873 // can do so cheaply.
16874 int NumElts = VT.getVectorNumElements();
16875 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16876
16877 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16878 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16879 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16880 return Insertion;
16881
16882 // Handle special cases where the lower or upper half is UNDEF.
16883 if (SDValue V =
16884 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16885 return V;
16886
16887 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16888 // can check for those subtargets here and avoid much of the subtarget
16889 // querying in the per-vector-type lowering routines. With AVX1 we have
16890 // essentially *zero* ability to manipulate a 256-bit vector with integer
16891 // types. Since we'll use floating point types there eventually, just
16892 // immediately cast everything to a float and operate entirely in that domain.
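 // For example, a v4i64 shuffle on AVX1 is bitcast to v4f64, shuffled in the
 // floating-point domain, and the result is bitcast back to v4i64.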
16893 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16894 int ElementBits = VT.getScalarSizeInBits();
16895 if (ElementBits < 32) {
16896 // No floating point type available, if we can't use the bit operations
16897 // for masking/blending then decompose into 128-bit vectors.
16898 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16899 Subtarget, DAG))
16900 return V;
16901 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16902 return V;
16903 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16904 }
16905
16906 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16907 VT.getVectorNumElements());
16908 V1 = DAG.getBitcast(FpVT, V1);
16909 V2 = DAG.getBitcast(FpVT, V2);
16910 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16911 }
16912
16913 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16914 V1 = DAG.getBitcast(MVT::v16i16, V1);
16915 V2 = DAG.getBitcast(MVT::v16i16, V2);
16916 return DAG.getBitcast(VT,
16917 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16918 }
16919
16920 switch (VT.SimpleTy) {
16921 case MVT::v4f64:
16922 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16923 case MVT::v4i64:
16924 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16925 case MVT::v8f32:
16926 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16927 case MVT::v8i32:
16928 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16929 case MVT::v16i16:
16930 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16931 case MVT::v32i8:
16932 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16933
16934 default:
16935 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16936 }
16937}
16938
16939/// Try to lower a vector shuffle as a 128-bit shuffles.
16940static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16941 const APInt &Zeroable, SDValue V1, SDValue V2,
16942 const X86Subtarget &Subtarget,
16943 SelectionDAG &DAG) {
16944 assert(VT.getScalarSizeInBits() == 64 &&
16945 "Unexpected element type size for 128bit shuffle.");
16946
16947 // Handling a 256-bit vector would require VLX, and most probably the
16948 // function lowerV2X128VectorShuffle() is the better solution.
16949 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16950
16951 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16952 SmallVector<int, 4> Widened128Mask;
16953 if (!canWidenShuffleElements(Mask, Widened128Mask))
16954 return SDValue();
16955 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16956
16957 // Try to use an insert into a zero vector.
16958 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16959 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16960 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16961 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16962 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16963 DAG.getVectorIdxConstant(0, DL));
16964 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16965 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16966 DAG.getVectorIdxConstant(0, DL));
16967 }
16968
16969 // Check for patterns which can be matched with a single insert of a 256-bit
16970 // subvector.
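 // For example (illustrative), a v8f64 mask <0,1,2,3,8,9,10,11> keeps the low
 // 256 bits of V1 and takes the low 256 bits of V2 as the upper half, so it
 // becomes a single insertion of V2's low subvector at element index 4.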
16971 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16972 if (OnlyUsesV1 ||
16973 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16974 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16975 SDValue SubVec =
16976 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16977 DAG.getVectorIdxConstant(0, DL));
16978 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16979 DAG.getVectorIdxConstant(4, DL));
16980 }
16981
16982 // See if this is an insertion of the lower 128-bits of V2 into V1.
16983 bool IsInsert = true;
16984 int V2Index = -1;
16985 for (int i = 0; i < 4; ++i) {
16986 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16987 if (Widened128Mask[i] < 0)
16988 continue;
16989
16990 // Make sure all V1 subvectors are in place.
16991 if (Widened128Mask[i] < 4) {
16992 if (Widened128Mask[i] != i) {
16993 IsInsert = false;
16994 break;
16995 }
16996 } else {
16997 // Make sure we only have a single V2 index and it's the lowest 128-bits.
16998 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16999 IsInsert = false;
17000 break;
17001 }
17002 V2Index = i;
17003 }
17004 }
17005 if (IsInsert && V2Index >= 0) {
17006 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17007 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17008 DAG.getVectorIdxConstant(0, DL));
17009 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17010 }
17011
17012 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
17013 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17014 // possible we at least ensure the lanes stay sequential to help later
17015 // combines.
17016 SmallVector<int, 2> Widened256Mask;
17017 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17018 Widened128Mask.clear();
17019 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17020 }
17021
17022 // Try to lower to vshuf64x2/vshuf32x4.
17023 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17024 int PermMask[4] = {-1, -1, -1, -1};
17025 // Ensure elements came from the same Op.
17026 for (int i = 0; i < 4; ++i) {
17027 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17028 if (Widened128Mask[i] < 0)
17029 continue;
17030
17031 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17032 unsigned OpIndex = i / 2;
17033 if (Ops[OpIndex].isUndef())
17034 Ops[OpIndex] = Op;
17035 else if (Ops[OpIndex] != Op)
17036 return SDValue();
17037
17038 PermMask[i] = Widened128Mask[i] % 4;
17039 }
17040
17041 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17042 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17043}
17044
17045/// Handle lowering of 8-lane 64-bit floating point shuffles.
17046static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17047 const APInt &Zeroable, SDValue V1, SDValue V2,
17048 const X86Subtarget &Subtarget,
17049 SelectionDAG &DAG) {
17050 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17051 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17052 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17053
17054 if (V2.isUndef()) {
17055 // Use low duplicate instructions for masks that match their pattern.
17056 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17057 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17058
17059 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17060 // Non-half-crossing single input shuffles can be lowered with an
17061 // interleaved permutation.
17062 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17063 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17064 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17065 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17066 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17067 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17068 }
17069
17070 SmallVector<int, 4> RepeatedMask;
17071 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17072 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17073 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17074 }
17075
17076 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17077 V2, Subtarget, DAG))
17078 return Shuf128;
17079
17080 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17081 return Unpck;
17082
17083 // Check if the blend happens to exactly fit that of SHUFPD.
17084 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17085 Zeroable, Subtarget, DAG))
17086 return Op;
17087
17088 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17089 Subtarget, DAG))
17090 return V;
17091
17092 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17093 Zeroable, Subtarget, DAG))
17094 return Blend;
17095
17096 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17097}
17098
17099/// Handle lowering of 16-lane 32-bit floating point shuffles.
17100static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17101 const APInt &Zeroable, SDValue V1, SDValue V2,
17102 const X86Subtarget &Subtarget,
17103 SelectionDAG &DAG) {
17104 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17105 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17106 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17107
17108 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17109 // options to efficiently lower the shuffle.
17110 SmallVector<int, 4> RepeatedMask;
17111 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17112 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17113
17114 // Use even/odd duplicate instructions for masks that match their pattern.
17115 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17116 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17117 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17118 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17119
17120 if (V2.isUndef())
17121 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17122 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17123
17124 // Use dedicated unpack instructions for masks that match their pattern.
17125 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17126 return V;
17127
17128 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17129 Zeroable, Subtarget, DAG))
17130 return Blend;
17131
17132 // Otherwise, fall back to a SHUFPS sequence.
17133 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17134 }
17135
17136 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17137 Zeroable, Subtarget, DAG))
17138 return Blend;
17139
17140 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17141 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17142 return DAG.getBitcast(MVT::v16f32, ZExt);
17143
17144 // Try to create an in-lane repeating shuffle mask and then shuffle the
17145 // results into the target lanes.
17146 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17147 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17148 return V;
17149
17150 // If we have a single input shuffle with different shuffle patterns in the
17151 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17152 if (V2.isUndef() &&
17153 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17154 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17155 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17156 }
17157
17158 // If we have AVX512F support, we can use VEXPAND.
17159 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17160 Zeroable, Subtarget, DAG))
17161 return V;
17162
17163 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17164}
17165
17166/// Handle lowering of 8-lane 64-bit integer shuffles.
17167static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17168 const APInt &Zeroable, SDValue V1, SDValue V2,
17169 const X86Subtarget &Subtarget,
17170 SelectionDAG &DAG) {
17171 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17172 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17173 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17174
17175 // Try to use shift instructions if fast.
17176 if (Subtarget.preferLowerShuffleAsShift())
17177 if (SDValue Shift =
17178 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17179 Subtarget, DAG, /*BitwiseOnly*/ true))
17180 return Shift;
17181
17182 if (V2.isUndef()) {
17183 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17184 // can use lower latency instructions that will operate on all four
17185 // 128-bit lanes.
17186 SmallVector<int, 2> Repeated128Mask;
17187 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17188 SmallVector<int, 4> PSHUFDMask;
17189 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17190 return DAG.getBitcast(
17191 MVT::v8i64,
17192 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17193 DAG.getBitcast(MVT::v16i32, V1),
17194 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17195 }
17196
17197 SmallVector<int, 4> Repeated256Mask;
17198 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17199 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17200 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17201 }
17202
17203 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17204 V2, Subtarget, DAG))
17205 return Shuf128;
17206
17207 // Try to use shift instructions.
17208 if (SDValue Shift =
17209 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17210 DAG, /*BitwiseOnly*/ false))
17211 return Shift;
17212
17213 // Try to use VALIGN.
17214 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17215 Zeroable, Subtarget, DAG))
17216 return Rotate;
17217
17218 // Try to use PALIGNR.
17219 if (Subtarget.hasBWI())
17220 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17221 Subtarget, DAG))
17222 return Rotate;
17223
17224 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17225 return Unpck;
17226
17227 // If we have AVX512F support, we can use VEXPAND.
17228 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17229 Subtarget, DAG))
17230 return V;
17231
17232 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17233 Zeroable, Subtarget, DAG))
17234 return Blend;
17235
17236 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17237}
17238
17239/// Handle lowering of 16-lane 32-bit integer shuffles.
17240static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17241 const APInt &Zeroable, SDValue V1, SDValue V2,
17242 const X86Subtarget &Subtarget,
17243 SelectionDAG &DAG) {
17244 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17245 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17246 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17247
17248 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17249
17250 // Whenever we can lower this as a zext, that instruction is strictly faster
17251 // than any alternative. It also allows us to fold memory operands into the
17252 // shuffle in many cases.
17253 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17254 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17255 return ZExt;
17256
17257 // Try to use shift instructions if fast.
17258 if (Subtarget.preferLowerShuffleAsShift()) {
17259 if (SDValue Shift =
17260 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17261 Subtarget, DAG, /*BitwiseOnly*/ true))
17262 return Shift;
17263 if (NumV2Elements == 0)
17264 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17265 Subtarget, DAG))
17266 return Rotate;
17267 }
17268
17269 // If the shuffle mask is repeated in each 128-bit lane we can use more
17270 // efficient instructions that mirror the shuffles across the four 128-bit
17271 // lanes.
17272 SmallVector<int, 4> RepeatedMask;
17273 bool Is128BitLaneRepeatedShuffle =
17274 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17275 if (Is128BitLaneRepeatedShuffle) {
17276 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17277 if (V2.isUndef())
17278 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17279 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17280
17281 // Use dedicated unpack instructions for masks that match their pattern.
17282 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17283 return V;
17284 }
17285
17286 // Try to use shift instructions.
17287 if (SDValue Shift =
17288 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17289 Subtarget, DAG, /*BitwiseOnly*/ false))
17290 return Shift;
17291
17292 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17293 if (SDValue Rotate =
17294 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17295 return Rotate;
17296
17297 // Try to use VALIGN.
17298 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17299 Zeroable, Subtarget, DAG))
17300 return Rotate;
17301
17302 // Try to use byte rotation instructions.
17303 if (Subtarget.hasBWI())
17304 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17305 Subtarget, DAG))
17306 return Rotate;
17307
17308 // Assume that a single SHUFPS is faster than using a permv shuffle.
17309 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17310 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17311 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17312 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17313 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17314 CastV1, CastV2, DAG);
17315 return DAG.getBitcast(MVT::v16i32, ShufPS);
17316 }
17317
17318 // Try to create an in-lane repeating shuffle mask and then shuffle the
17319 // results into the target lanes.
17320 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17321 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17322 return V;
17323
17324 // If we have AVX512F support, we can use VEXPAND.
17325 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17326 Zeroable, Subtarget, DAG))
17327 return V;
17328
17329 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17330 Zeroable, Subtarget, DAG))
17331 return Blend;
17332
17333 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17334}
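// Example: a v16i32 shuffle that reverses the elements of every 128-bit lane
// (the 16-element mask repeats {3, 2, 1, 0} per lane) with an undef V2 takes
// the Is128BitLaneRepeatedShuffle path above and becomes one X86ISD::PSHUFD
// (vpshufd) with immediate 0x1B; no cross-lane permute is needed.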
17335
17336/// Handle lowering of 32-lane 16-bit integer shuffles.
17337static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17338 const APInt &Zeroable, SDValue V1, SDValue V2,
17339 const X86Subtarget &Subtarget,
17340 SelectionDAG &DAG) {
17341 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17342 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17343 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17344 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17345
17346 // Whenever we can lower this as a zext, that instruction is strictly faster
17347 // than any alternative. It also allows us to fold memory operands into the
17348 // shuffle in many cases.
17349 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17350 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17351 return ZExt;
17352
17353 // Use dedicated unpack instructions for masks that match their pattern.
17354 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17355 return V;
17356
17357 // Use dedicated pack instructions for masks that match their pattern.
17358 if (SDValue V =
17359 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17360 return V;
17361
17362 // Try to use shift instructions.
17363 if (SDValue Shift =
17364 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17365 Subtarget, DAG, /*BitwiseOnly*/ false))
17366 return Shift;
17367
17368 // Try to use byte rotation instructions.
17369 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17370 Subtarget, DAG))
17371 return Rotate;
17372
17373 if (V2.isUndef()) {
17374 // Try to use bit rotation instructions.
17375 if (SDValue Rotate =
17376 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17377 return Rotate;
17378
17379 SmallVector<int, 8> RepeatedMask;
17380 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17381 // As this is a single-input shuffle, the repeated mask should be
17382 // a strictly valid v8i16 mask that we can pass through to the v8i16
17383 // lowering to handle even the v32 case.
17384 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17385 RepeatedMask, Subtarget, DAG);
17386 }
17387 }
17388
17389 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17390 Zeroable, Subtarget, DAG))
17391 return Blend;
17392
17393 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17394 Zeroable, Subtarget, DAG))
17395 return PSHUFB;
17396
17397 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17398 // shuffle.
17399 if (!V2.isUndef())
17400 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17401 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17402 return Result;
17403
17404 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17405}
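// Example: a unary v32i16 shuffle that reverses the words of each 64-bit half
// within every 128-bit lane (repeated mask {3, 2, 1, 0, 7, 6, 5, 4}) is neither
// a rotate nor an unpack, so it reaches the repeated-mask path above and is
// handed to lowerV8I16GeneralSingleInputShuffle, which can typically emit an
// in-lane pshuflw/pshufhw pair instead of a full vpermw.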
17406
17407/// Handle lowering of 64-lane 8-bit integer shuffles.
17408static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17409 const APInt &Zeroable, SDValue V1, SDValue V2,
17410 const X86Subtarget &Subtarget,
17411 SelectionDAG &DAG) {
17412 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17413 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17414 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17415 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17416
17417 // Whenever we can lower this as a zext, that instruction is strictly faster
17418 // than any alternative. It also allows us to fold memory operands into the
17419 // shuffle in many cases.
17420 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17421 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17422 return ZExt;
17423
17424 // Use dedicated unpack instructions for masks that match their pattern.
17425 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17426 return V;
17427
17428 // Use dedicated pack instructions for masks that match their pattern.
17429 if (SDValue V =
17430 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17431 return V;
17432
17433 // Try to use shift instructions.
17434 if (SDValue Shift =
17435 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17436 DAG, /*BitwiseOnly*/ false))
17437 return Shift;
17438
17439 // Try to use byte rotation instructions.
17440 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17441 Subtarget, DAG))
17442 return Rotate;
17443
17444 // Try to use bit rotation instructions.
17445 if (V2.isUndef())
17446 if (SDValue Rotate =
17447 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17448 return Rotate;
17449
17450 // Lower as AND if possible.
17451 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17452 Zeroable, Subtarget, DAG))
17453 return Masked;
17454
17455 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17456 Zeroable, Subtarget, DAG))
17457 return PSHUFB;
17458
17459 // Try to create an in-lane repeating shuffle mask and then shuffle the
17460 // results into the target lanes.
17461 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17462 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17463 return V;
17464
17465 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17466 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17467 return Result;
17468
17469 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17470 Zeroable, Subtarget, DAG))
17471 return Blend;
17472
17473 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17474 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17475 // PALIGNR will be cheaper than the second PSHUFB+OR.
17476 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17477 Mask, Subtarget, DAG))
17478 return V;
17479
17480 // If we can't directly blend but can use PSHUFB, that will be better as it
17481 // can both shuffle and set up the inefficient blend.
17482 bool V1InUse, V2InUse;
17483 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17484 DAG, V1InUse, V2InUse);
17485 }
17486
17487 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17488 // shuffle.
17489 if (!V2.isUndef())
17490 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17491 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17492 return Result;
17493
17494 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17495 if (Subtarget.hasVBMI())
17496 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17497
17498 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17499}
17500
17501/// High-level routine to lower various 512-bit x86 vector shuffles.
17502///
17503/// This routine either breaks down the specific type of a 512-bit x86 vector
17504/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17505/// together based on the available instructions.
17506static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17507 MVT VT, SDValue V1, SDValue V2,
17508 const APInt &Zeroable,
17509 const X86Subtarget &Subtarget,
17510 SelectionDAG &DAG) {
17511 assert(Subtarget.hasAVX512() &&
17512 "Cannot lower 512-bit vectors w/ basic ISA!");
17513
17514 // If we have a single input to the zero element, insert that into V1 if we
17515 // can do so cheaply.
17516 int NumElts = Mask.size();
17517 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17518
17519 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17520 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17521 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17522 return Insertion;
17523
17524 // Handle special cases where the lower or upper half is UNDEF.
17525 if (SDValue V =
17526 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17527 return V;
17528
17529 // Check for being able to broadcast a single element.
17530 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17531 Subtarget, DAG))
17532 return Broadcast;
17533
17534 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17535 // Try using bit ops for masking and blending before falling back to
17536 // splitting.
17537 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17538 Subtarget, DAG))
17539 return V;
17540 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17541 return V;
17542
17543 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17544 }
17545
17546 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17547 if (!Subtarget.hasBWI())
17548 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17549 /*SimpleOnly*/ false);
17550
17551 V1 = DAG.getBitcast(MVT::v32i16, V1);
17552 V2 = DAG.getBitcast(MVT::v32i16, V2);
17553 return DAG.getBitcast(VT,
17554 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17555 }
17556
17557 // Dispatch to each element type for lowering. If we don't have support for
17558 // specific element type shuffles at 512 bits, immediately split them and
17559 // lower them. Each lowering routine of a given type is allowed to assume that
17560 // the requisite ISA extensions for that element type are available.
17561 switch (VT.SimpleTy) {
17562 case MVT::v8f64:
17563 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17564 case MVT::v16f32:
17565 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17566 case MVT::v8i64:
17567 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17568 case MVT::v16i32:
17569 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17570 case MVT::v32i16:
17571 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17572 case MVT::v64i8:
17573 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17574
17575 default:
17576 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17577 }
17578}
17579
17580static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17581 MVT VT, SDValue V1, SDValue V2,
17582 const X86Subtarget &Subtarget,
17583 SelectionDAG &DAG) {
17584 // Shuffle should be unary.
17585 if (!V2.isUndef())
17586 return SDValue();
17587
17588 int ShiftAmt = -1;
17589 int NumElts = Mask.size();
17590 for (int i = 0; i != NumElts; ++i) {
17591 int M = Mask[i];
17592 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17593 "Unexpected mask index.");
17594 if (M < 0)
17595 continue;
17596
17597 // The first non-undef element determines our shift amount.
17598 if (ShiftAmt < 0) {
17599 ShiftAmt = M - i;
17600 // Need to be shifting right.
17601 if (ShiftAmt <= 0)
17602 return SDValue();
17603 }
17604 // All non-undef elements must shift by the same amount.
17605 if (ShiftAmt != M - i)
17606 return SDValue();
17607 }
17608 assert(ShiftAmt >= 0 && "All undef?");
17609
17610 // Great we found a shift right.
17611 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17612 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17613 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17614 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17615 DAG.getVectorIdxConstant(0, DL));
17616}
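// Example: a v8i1 shuffle with Mask = {2, 3, 4, 5, 6, 7, -1, -1} has a uniform
// ShiftAmt of 2 for every defined element, so the mask register is widened to
// a legal width, shifted right by 2 with X86ISD::KSHIFTR, and the low v8i1 is
// extracted back out.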
17617
17618// Determine if this shuffle can be implemented with a KSHIFT instruction.
17619// Returns the shift amount if possible or -1 if not. This is a simplified
17620// version of matchShuffleAsShift.
17621static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17622 int MaskOffset, const APInt &Zeroable) {
17623 int Size = Mask.size();
17624
17625 auto CheckZeros = [&](int Shift, bool Left) {
17626 for (int j = 0; j < Shift; ++j)
17627 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17628 return false;
17629
17630 return true;
17631 };
17632
17633 auto MatchShift = [&](int Shift, bool Left) {
17634 unsigned Pos = Left ? Shift : 0;
17635 unsigned Low = Left ? 0 : Shift;
17636 unsigned Len = Size - Shift;
17637 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17638 };
17639
17640 for (int Shift = 1; Shift != Size; ++Shift)
17641 for (bool Left : {true, false})
17642 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17643 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17644 return Shift;
17645 }
17646
17647 return -1;
17648}
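// Example: for Mask = {-1, -1, 0, 1, 2, 3, 4, 5} on a v8i1 shuffle where the
// first two result elements are zeroable, MatchShift(2, /*Left=*/true) sees a
// sequential run starting at 0 in positions 2..7 and CheckZeros confirms the
// two vacated positions, so this returns 2 with Opcode set to X86ISD::KSHIFTL.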
17649
17650
17651// Lower vXi1 vector shuffles.
17652 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17653 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17654 // vector, shuffle it, and then truncate it back.
17655static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17656 MVT VT, SDValue V1, SDValue V2,
17657 const APInt &Zeroable,
17658 const X86Subtarget &Subtarget,
17659 SelectionDAG &DAG) {
17660 assert(Subtarget.hasAVX512() &&
17661 "Cannot lower 512-bit vectors w/o basic ISA!");
17662
17663 int NumElts = Mask.size();
17664 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17665
17666 // Try to recognize shuffles that are just padding a subvector with zeros.
17667 int SubvecElts = 0;
17668 int Src = -1;
17669 for (int i = 0; i != NumElts; ++i) {
17670 if (Mask[i] >= 0) {
17671 // Grab the source from the first valid mask. All subsequent elements need
17672 // to use this same source.
17673 if (Src < 0)
17674 Src = Mask[i] / NumElts;
17675 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17676 break;
17677 }
17678
17679 ++SubvecElts;
17680 }
17681 assert(SubvecElts != NumElts && "Identity shuffle?");
17682
17683 // Clip to a power of 2.
17684 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17685
17686 // Make sure the number of zeroable bits in the top at least covers the bits
17687 // not covered by the subvector.
17688 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17689 assert(Src >= 0 && "Expected a source!");
17690 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17691 SDValue Extract =
17692 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
17693 DAG.getVectorIdxConstant(0, DL));
17694 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17695 DAG.getConstant(0, DL, VT), Extract,
17696 DAG.getVectorIdxConstant(0, DL));
17697 }
17698
17699 // Try a simple shift right with undef elements. Later we'll try with zeros.
17700 if (SDValue Shift =
17701 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
17702 return Shift;
17703
17704 // Try to match KSHIFTs.
17705 unsigned Offset = 0;
17706 for (SDValue V : {V1, V2}) {
17707 unsigned Opcode;
17708 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17709 if (ShiftAmt >= 0) {
17710 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17711 MVT WideVT = Res.getSimpleValueType();
17712 // Widened right shifts need two shifts to ensure we shift in zeroes.
17713 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17714 int WideElts = WideVT.getVectorNumElements();
17715 // Shift left to put the original vector in the MSBs of the new size.
17716 Res =
17717 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17718 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17719 // Increase the shift amount to account for the left shift.
17720 ShiftAmt += WideElts - NumElts;
17721 }
17722
17723 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17724 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17725 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17726 DAG.getVectorIdxConstant(0, DL));
17727 }
17728 Offset += NumElts; // Increment for next iteration.
17729 }
17730
17731 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17732 // ops instead.
17733 // TODO: What other unary shuffles would benefit from this?
17734 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17735 SDValue Op0 = V1.getOperand(0);
17736 SDValue Op1 = V1.getOperand(1);
17737 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17738 EVT OpVT = Op0.getValueType();
17739 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17740 return DAG.getSetCC(
17741 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17742 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17743 }
17744
17745 MVT ExtVT;
17746 switch (VT.SimpleTy) {
17747 default:
17748 llvm_unreachable("Expected a vector of i1 elements");
17749 case MVT::v2i1:
17750 ExtVT = MVT::v2i64;
17751 break;
17752 case MVT::v4i1:
17753 ExtVT = MVT::v4i32;
17754 break;
17755 case MVT::v8i1:
17756 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17757 // shuffle.
17758 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17759 break;
17760 case MVT::v16i1:
17761 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17762 // 256-bit operation available.
17763 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17764 break;
17765 case MVT::v32i1:
17766 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17767 // 256-bit operation available.
17768 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17769 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17770 break;
17771 case MVT::v64i1:
17772 // Fall back to scalarization. FIXME: We can do better if the shuffle
17773 // can be partitioned cleanly.
17774 if (!Subtarget.useBWIRegs())
17775 return SDValue();
17776 ExtVT = MVT::v64i8;
17777 break;
17778 }
17779
17780 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17781 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17782
17783 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17784 // i1 was sign extended, so we can use X86ISD::CVT2MASK.
17785 int NumElems = VT.getVectorNumElements();
17786 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17787 (Subtarget.hasDQI() && (NumElems < 32)))
17788 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17789 Shuffle, ISD::SETGT);
17790
17791 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17792}
17793
17794/// Helper function that returns true if the shuffle mask should be
17795/// commuted to improve canonicalization.
17796static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17797 int NumElements = Mask.size();
17798
17799 int NumV1Elements = 0, NumV2Elements = 0;
17800 for (int M : Mask)
17801 if (M < 0)
17802 continue;
17803 else if (M < NumElements)
17804 ++NumV1Elements;
17805 else
17806 ++NumV2Elements;
17807
17808 // Commute the shuffle as needed such that more elements come from V1 than
17809 // V2. This allows us to match the shuffle pattern strictly on how many
17810 // elements come from V1 without handling the symmetric cases.
17811 if (NumV2Elements > NumV1Elements)
17812 return true;
17813
17814 assert(NumV1Elements > 0 && "No V1 indices");
17815
17816 if (NumV2Elements == 0)
17817 return false;
17818
17819 // When the number of V1 and V2 elements are the same, try to minimize the
17820 // number of uses of V2 in the low half of the vector. When that is tied,
17821 // ensure that the sum of indices for V1 is equal to or lower than the sum
17822 // of indices for V2. When those are equal, try to ensure that the number of odd
17823 // indices for V1 is lower than the number of odd indices for V2.
17824 if (NumV1Elements == NumV2Elements) {
17825 int LowV1Elements = 0, LowV2Elements = 0;
17826 for (int M : Mask.slice(0, NumElements / 2))
17827 if (M >= NumElements)
17828 ++LowV2Elements;
17829 else if (M >= 0)
17830 ++LowV1Elements;
17831 if (LowV2Elements > LowV1Elements)
17832 return true;
17833 if (LowV2Elements == LowV1Elements) {
17834 int SumV1Indices = 0, SumV2Indices = 0;
17835 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17836 if (Mask[i] >= NumElements)
17837 SumV2Indices += i;
17838 else if (Mask[i] >= 0)
17839 SumV1Indices += i;
17840 if (SumV2Indices < SumV1Indices)
17841 return true;
17842 if (SumV2Indices == SumV1Indices) {
17843 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17844 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17845 if (Mask[i] >= NumElements)
17846 NumV2OddIndices += i % 2;
17847 else if (Mask[i] >= 0)
17848 NumV1OddIndices += i % 2;
17849 if (NumV2OddIndices < NumV1OddIndices)
17850 return true;
17851 }
17852 }
17853 }
17854
17855 return false;
17856}
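// Example: a v4i32 mask {4, 5, 6, 1} takes three elements from V2 and only one
// from V1, so this returns true; the caller then commutes the mask to
// {0, 1, 2, 5} and swaps V1/V2 so that most elements come from the first input.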
17857
17858static bool canCombineAsMaskOperation(SDValue V,
17859 const X86Subtarget &Subtarget) {
17860 if (!Subtarget.hasAVX512())
17861 return false;
17862
17863 if (!V.getValueType().isSimple())
17864 return false;
17865
17866 MVT VT = V.getSimpleValueType().getScalarType();
17867 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17868 return false;
17869
17870 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17871 // are preferable to blendw/blendvb/masked-mov.
17872 if ((VT == MVT::i16 || VT == MVT::i8) &&
17873 V.getSimpleValueType().getSizeInBits() < 512)
17874 return false;
17875
17876 auto HasMaskOperation = [&](SDValue V) {
17877 // TODO: Currently we only check a limited set of opcodes. We could extend
17878 // it to all binary operations by checking TLI.isBinOp().
17879 switch (V->getOpcode()) {
17880 default:
17881 return false;
17882 case ISD::ADD:
17883 case ISD::SUB:
17884 case ISD::AND:
17885 case ISD::XOR:
17886 case ISD::OR:
17887 case ISD::SMAX:
17888 case ISD::SMIN:
17889 case ISD::UMAX:
17890 case ISD::UMIN:
17891 case ISD::ABS:
17892 case ISD::SHL:
17893 case ISD::SRL:
17894 case ISD::SRA:
17895 case ISD::MUL:
17896 break;
17897 }
17898 if (!V->hasOneUse())
17899 return false;
17900
17901 return true;
17902 };
17903
17904 if (HasMaskOperation(V))
17905 return true;
17906
17907 return false;
17908}
17909
17910// Forward declaration.
17911static SDValue canonicalizeShuffleMaskWithHorizOp(
17912 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17913 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17914 const X86Subtarget &Subtarget);
17915
17916 /// Top-level lowering for x86 vector shuffles.
17917///
17918/// This handles decomposition, canonicalization, and lowering of all x86
17919/// vector shuffles. Most of the specific lowering strategies are encapsulated
17920/// above in helper routines. The canonicalization attempts to widen shuffles
17921/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17922/// s.t. only one of the two inputs needs to be tested, etc.
17923static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17924 SelectionDAG &DAG) {
17925 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17926 ArrayRef<int> OrigMask = SVOp->getMask();
17927 SDValue V1 = Op.getOperand(0);
17928 SDValue V2 = Op.getOperand(1);
17929 MVT VT = Op.getSimpleValueType();
17930 int NumElements = VT.getVectorNumElements();
17931 SDLoc DL(Op);
17932 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17933
17934 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17935 "Can't lower MMX shuffles");
17936
17937 bool V1IsUndef = V1.isUndef();
17938 bool V2IsUndef = V2.isUndef();
17939 if (V1IsUndef && V2IsUndef)
17940 return DAG.getUNDEF(VT);
17941
17942 // When we create a shuffle node we put the UNDEF node as the second operand,
17943 // but in some cases the first operand may be transformed to UNDEF.
17944 // In this case we should just commute the node.
17945 if (V1IsUndef)
17946 return DAG.getCommutedVectorShuffle(*SVOp);
17947
17948 // Check for non-undef masks pointing at an undef vector and make the masks
17949 // undef as well. This makes it easier to match the shuffle based solely on
17950 // the mask.
17951 if (V2IsUndef &&
17952 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17953 SmallVector<int, 8> NewMask(OrigMask);
17954 for (int &M : NewMask)
17955 if (M >= NumElements)
17956 M = -1;
17957 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17958 }
17959
17960 // Check for illegal shuffle mask element index values.
17961 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17962 (void)MaskUpperLimit;
17963 assert(llvm::all_of(OrigMask,
17964 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17965 "Out of bounds shuffle index");
17966
17967 // We actually see shuffles that are entirely re-arrangements of a set of
17968 // zero inputs. This mostly happens while decomposing complex shuffles into
17969 // simple ones. Directly lower these as a buildvector of zeros.
17970 APInt KnownUndef, KnownZero;
17971 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17972
17973 APInt Zeroable = KnownUndef | KnownZero;
17974 if (Zeroable.isAllOnes())
17975 return getZeroVector(VT, Subtarget, DAG, DL);
17976
17977 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17978
17979 // Try to collapse shuffles into using a vector type with fewer elements but
17980 // wider element types. We cap this to not form integers or floating point
17981 // elements wider than 64 bits. It does not seem beneficial to form i128
17982 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17983 SmallVector<int, 16> WidenedMask;
17984 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17985 !canCombineAsMaskOperation(V1, Subtarget) &&
17986 !canCombineAsMaskOperation(V2, Subtarget) &&
17987 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17988 // Shuffle mask widening should not interfere with a broadcast opportunity
17989 // by obfuscating the operands with bitcasts.
17990 // TODO: Avoid lowering directly from this top-level function: make this
17991 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17992 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17993 Subtarget, DAG))
17994 return Broadcast;
17995
17996 MVT NewEltVT = VT.isFloatingPoint()
17997 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17998 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17999 int NewNumElts = NumElements / 2;
18000 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18001 // Make sure that the new vector type is legal. For example, v2f64 isn't
18002 // legal on SSE1.
18003 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18004 if (V2IsZero) {
18005 // Modify the new Mask to take all zeros from the all-zero vector.
18006 // Choose indices that are blend-friendly.
18007 bool UsedZeroVector = false;
18008 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18009 "V2's non-undef elements are used?!");
18010 for (int i = 0; i != NewNumElts; ++i)
18011 if (WidenedMask[i] == SM_SentinelZero) {
18012 WidenedMask[i] = i + NewNumElts;
18013 UsedZeroVector = true;
18014 }
18015 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18016 // some elements to be undef.
18017 if (UsedZeroVector)
18018 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18019 }
18020 V1 = DAG.getBitcast(NewVT, V1);
18021 V2 = DAG.getBitcast(NewVT, V2);
18022 return DAG.getBitcast(
18023 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18024 }
18025 }
18026
18027 SmallVector<SDValue> Ops = {V1, V2};
18028 SmallVector<int> Mask(OrigMask);
18029
18030 // Canonicalize the shuffle with any horizontal ops inputs.
18031 // NOTE: This may update Ops and Mask.
18032 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18033 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18034 return DAG.getBitcast(VT, HOp);
18035
18036 V1 = DAG.getBitcast(VT, Ops[0]);
18037 V2 = DAG.getBitcast(VT, Ops[1]);
18038 assert(NumElements == (int)Mask.size() &&
18039 "canonicalizeShuffleMaskWithHorizOp "
18040 "shouldn't alter the shuffle mask size");
18041
18042 // Commute the shuffle if it will improve canonicalization.
18043 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18044 ShuffleVectorSDNode::commuteMask(Mask);
18045 std::swap(V1, V2);
18046 }
18047
18048 // For each vector width, delegate to a specialized lowering routine.
18049 if (VT.is128BitVector())
18050 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18051
18052 if (VT.is256BitVector())
18053 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18054
18055 if (VT.is512BitVector())
18056 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18057
18058 if (Is1BitVector)
18059 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18060
18061 llvm_unreachable("Unimplemented!");
18062}
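// Example of the widening step above: a v8i16 shuffle with mask
// {2, 3, 0, 1, 6, 7, 4, 5} only moves aligned pairs of elements, so (assuming
// neither operand is a candidate for an AVX512 masked operation) it is
// re-expressed as a v4i32 shuffle of the bitcast operands with mask
// {1, 0, 3, 2} before being dispatched to the width-specific routines.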
18063
18064// As legal vpcompress instructions depend on various AVX512 extensions, try to
18065// convert illegal vector sizes to legal ones to avoid expansion.
18066static SDValue LowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18067 SelectionDAG &DAG) {
18068 assert(Subtarget.hasAVX512() &&
18069 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18070
18071 SDLoc DL(Op);
18072 SDValue Vec = Op.getOperand(0);
18073 SDValue Mask = Op.getOperand(1);
18074 SDValue Passthru = Op.getOperand(2);
18075
18076 EVT VecVT = Vec.getValueType();
18077 EVT ElementVT = VecVT.getVectorElementType();
18078 unsigned NumElements = VecVT.getVectorNumElements();
18079 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18080 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18081
18082 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18083 // compressed as 512-bit vectors in AVX512F.
18084 if (NumVecBits != 128 && NumVecBits != 256)
18085 return SDValue();
18086
18087 if (NumElementBits == 32 || NumElementBits == 64) {
18088 unsigned NumLargeElements = 512 / NumElementBits;
18089 MVT LargeVecVT =
18090 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18091 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18092
18093 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18094 DAG, DL);
18095 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18096 Subtarget, DAG, DL);
18097 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18098 : widenSubVector(LargeVecVT, Passthru,
18099 /*ZeroNewElements=*/false,
18100 Subtarget, DAG, DL);
18101
18102 SDValue Compressed =
18103 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18104 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18105 DAG.getConstant(0, DL, MVT::i64));
18106 }
18107
18108 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18109 VecVT == MVT::v16i16) {
18110 MVT LageElementVT = MVT::getIntegerVT(512 / NumElements);
18111 EVT LargeVecVT = MVT::getVectorVT(LageElementVT, NumElements);
18112
18113 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18114 Passthru = Passthru.isUndef()
18115 ? DAG.getUNDEF(LargeVecVT)
18116 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18117
18118 SDValue Compressed =
18119 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18120 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18121 }
18122
18123 return SDValue();
18124}
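// Example: an ISD::VECTOR_COMPRESS of v4f32 on a target without AVX512VL is
// widened to v16f32 here, with the mask widened to v16i1 using zeros so the
// new lanes never select anything; the 512-bit compress is then legal and the
// low 128 bits are extracted as the result.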
18125
18126/// Try to lower a VSELECT instruction to a vector shuffle.
18127static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18128 const X86Subtarget &Subtarget,
18129 SelectionDAG &DAG) {
18130 SDValue Cond = Op.getOperand(0);
18131 SDValue LHS = Op.getOperand(1);
18132 SDValue RHS = Op.getOperand(2);
18133 MVT VT = Op.getSimpleValueType();
18134
18135 // Only non-legal VSELECTs reach this lowering, convert those into generic
18136 // shuffles and re-use the shuffle lowering path for blends.
18137 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18138 SmallVector<int, 32> Mask;
18139 if (createShuffleMaskFromVSELECT(Mask, Cond))
18140 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18141 }
18142
18143 return SDValue();
18144}
18145
18146SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18147 SDValue Cond = Op.getOperand(0);
18148 SDValue LHS = Op.getOperand(1);
18149 SDValue RHS = Op.getOperand(2);
18150
18151 SDLoc dl(Op);
18152 MVT VT = Op.getSimpleValueType();
18153 if (isSoftF16(VT, Subtarget)) {
18154 MVT NVT = VT.changeVectorElementTypeToInteger();
18155 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18156 DAG.getBitcast(NVT, LHS),
18157 DAG.getBitcast(NVT, RHS)));
18158 }
18159
18160 // A vselect where all conditions and data are constants can be optimized into
18161 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18162 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18163 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18164 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18165 return SDValue();
18166
18167 // Try to lower this to a blend-style vector shuffle. This can handle all
18168 // constant condition cases.
18169 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18170 return BlendOp;
18171
18172 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18173 // with patterns on the mask registers on AVX-512.
18174 MVT CondVT = Cond.getSimpleValueType();
18175 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18176 if (CondEltSize == 1)
18177 return Op;
18178
18179 // Variable blends are only legal from SSE4.1 onward.
18180 if (!Subtarget.hasSSE41())
18181 return SDValue();
18182
18183 unsigned EltSize = VT.getScalarSizeInBits();
18184 unsigned NumElts = VT.getVectorNumElements();
18185
18186 // Expand v32i16/v64i8 without BWI.
18187 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18188 return SDValue();
18189
18190 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18191 // into an i1 condition so that we can use the mask-based 512-bit blend
18192 // instructions.
18193 if (VT.getSizeInBits() == 512) {
18194 // Build a mask by testing the condition against zero.
18195 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18196 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18197 DAG.getConstant(0, dl, CondVT),
18198 ISD::SETNE);
18199 // Now return a new VSELECT using the mask.
18200 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18201 }
18202
18203 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18204 if (CondEltSize != EltSize) {
18205 // If we don't have a sign splat, rely on the expansion.
18206 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18207 return SDValue();
18208
18209 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18210 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18211 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18212 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18213 }
18214
18215 // v16i16/v32i8 selects without AVX2, if the condition and another operand
18216 // are free to split, then better to split before expanding the
18217 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18218 // TODO: This is very similar to narrowVectorSelect.
18219 // TODO: Add Load splitting to isFreeToSplitVector ?
18220 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18221 !Subtarget.hasXOP()) {
18222 bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
18223 bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
18224 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18225 bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
18226 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18227 if (FreeCond && (FreeLHS || FreeRHS))
18228 return splitVectorOp(Op, DAG, dl);
18229 }
18230
18231 // Only some types will be legal on some subtargets. If we can emit a legal
18232 // VSELECT-matching blend, return Op; but if we need to expand, return
18233 // a null value.
18234 switch (VT.SimpleTy) {
18235 default:
18236 // Most of the vector types have blends past SSE4.1.
18237 return Op;
18238
18239 case MVT::v32i8:
18240 // The byte blends for AVX vectors were introduced only in AVX2.
18241 if (Subtarget.hasAVX2())
18242 return Op;
18243
18244 return SDValue();
18245
18246 case MVT::v8i16:
18247 case MVT::v16i16: {
18248 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18249 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18250 Cond = DAG.getBitcast(CastVT, Cond);
18251 LHS = DAG.getBitcast(CastVT, LHS);
18252 RHS = DAG.getBitcast(CastVT, RHS);
18253 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18254 return DAG.getBitcast(VT, Select);
18255 }
18256 }
18257}
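// Example: a v8i16 vselect with a variable all-sign-bits condition on SSE4.1
// has no variable word-granularity blend instruction, so the condition and
// both operands are bitcast to v16i8 and a byte vselect is emitted instead,
// which typically selects as pblendvb.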
18258
18259static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18260 MVT VT = Op.getSimpleValueType();
18261 SDValue Vec = Op.getOperand(0);
18262 SDValue Idx = Op.getOperand(1);
18263 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18264 SDLoc dl(Op);
18265
18266 if (!Vec.getSimpleValueType().is128BitVector())
18267 return SDValue();
18268
18269 if (VT.getSizeInBits() == 8) {
18270 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18271 // we're going to zero extend the register or fold the store.
18272 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18273 !X86::mayFoldIntoStore(Op))
18274 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18275 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18276 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18277
18278 unsigned IdxVal = Idx->getAsZExtVal();
18279 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18280 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18281 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18282 }
18283
18284 if (VT == MVT::f32) {
18285 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18286 // the result back to FR32 register. It's only worth matching if the
18287 // result has a single use which is a store or a bitcast to i32. And in
18288 // the case of a store, it's not worth it if the index is a constant 0,
18289 // because a MOVSSmr can be used instead, which is smaller and faster.
18290 if (!Op.hasOneUse())
18291 return SDValue();
18292 SDNode *User = *Op.getNode()->user_begin();
18293 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18294 (User->getOpcode() != ISD::BITCAST ||
18295 User->getValueType(0) != MVT::i32))
18296 return SDValue();
18297 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18298 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18299 return DAG.getBitcast(MVT::f32, Extract);
18300 }
18301
18302 if (VT == MVT::i32 || VT == MVT::i64)
18303 return Op;
18304
18305 return SDValue();
18306}
18307
18308/// Extract one bit from mask vector, like v16i1 or v8i1.
18309/// AVX-512 feature.
18310static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18311 const X86Subtarget &Subtarget) {
18312 SDValue Vec = Op.getOperand(0);
18313 SDLoc dl(Vec);
18314 MVT VecVT = Vec.getSimpleValueType();
18315 SDValue Idx = Op.getOperand(1);
18316 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18317 MVT EltVT = Op.getSimpleValueType();
18318
18319 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18320 "Unexpected vector type in ExtractBitFromMaskVector");
18321
18322 // variable index can't be handled in mask registers,
18323 // extend vector to VR512/128
18324 if (!IdxC) {
18325 unsigned NumElts = VecVT.getVectorNumElements();
18326 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
18327 // than extending to 128/256bit.
18328 if (NumElts == 1) {
18329 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18330 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18331 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18332 }
18333 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18334 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18335 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18336 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18337 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18338 }
18339
18340 unsigned IdxVal = IdxC->getZExtValue();
18341 if (IdxVal == 0) // the operation is legal
18342 return Op;
18343
18344 // Extend to natively supported kshift.
18345 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18346
18347 // Use kshiftr instruction to move to the lower element.
18348 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18349 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18350
18351 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18352 DAG.getVectorIdxConstant(0, dl));
18353}
18354
18355// Helper to find all the extracted elements from a vector.
18356static APInt getExtractedDemandedElts(SDNode *N) {
18357 MVT VT = N->getSimpleValueType(0);
18358 unsigned NumElts = VT.getVectorNumElements();
18359 APInt DemandedElts = APInt::getZero(NumElts);
18360 for (SDNode *User : N->users()) {
18361 switch (User->getOpcode()) {
18362 case X86ISD::PEXTRB:
18363 case X86ISD::PEXTRW:
18364 case ISD::EXTRACT_VECTOR_ELT:
18365 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18366 DemandedElts.setAllBits();
18367 return DemandedElts;
18368 }
18369 DemandedElts.setBit(User->getConstantOperandVal(1));
18370 break;
18371 case ISD::BITCAST: {
18372 if (!User->getValueType(0).isSimple() ||
18373 !User->getValueType(0).isVector()) {
18374 DemandedElts.setAllBits();
18375 return DemandedElts;
18376 }
18377 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18378 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18379 break;
18380 }
18381 default:
18382 DemandedElts.setAllBits();
18383 return DemandedElts;
18384 }
18385 }
18386 return DemandedElts;
18387}
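// Example: if a v16i8 node is only consumed by X86ISD::PEXTRB uses with
// constant indices 4 and 5, the returned APInt has just bits 4 and 5 set,
// which lets the i8 path in LowerEXTRACT_VECTOR_ELT below extract the shared
// 16-bit word once and shift out the individual bytes.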
18388
18389SDValue
18390X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18391 SelectionDAG &DAG) const {
18392 SDLoc dl(Op);
18393 SDValue Vec = Op.getOperand(0);
18394 MVT VecVT = Vec.getSimpleValueType();
18395 SDValue Idx = Op.getOperand(1);
18396 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18397
18398 if (VecVT.getVectorElementType() == MVT::i1)
18399 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18400
18401 if (!IdxC) {
18402 // It's more profitable to go through memory (1 cycle throughput)
18403 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18404 // IACA tool was used to get performance estimation
18405 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18406 //
18407 // example : extractelement <16 x i8> %a, i32 %i
18408 //
18409 // Block Throughput: 3.00 Cycles
18410 // Throughput Bottleneck: Port5
18411 //
18412 // | Num Of | Ports pressure in cycles | |
18413 // | Uops | 0 - DV | 5 | 6 | 7 | |
18414 // ---------------------------------------------
18415 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18416 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18417 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18418 // Total Num Of Uops: 4
18419 //
18420 //
18421 // Block Throughput: 1.00 Cycles
18422 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18423 //
18424 // | | Ports pressure in cycles | |
18425 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18426 // ---------------------------------------------------------
18427 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18428 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18429 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18430 // Total Num Of Uops: 4
18431
18432 return SDValue();
18433 }
18434
18435 unsigned IdxVal = IdxC->getZExtValue();
18436
18437 // If this is a 256-bit vector result, first extract the 128-bit vector and
18438 // then extract the element from the 128-bit vector.
18439 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18440 // Get the 128-bit vector.
18441 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18442 MVT EltVT = VecVT.getVectorElementType();
18443
18444 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18445 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18446
18447 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18448 // this can be done with a mask.
18449 IdxVal &= ElemsPerChunk - 1;
18450 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18451 DAG.getVectorIdxConstant(IdxVal, dl));
18452 }
18453
18454 assert(VecVT.is128BitVector() && "Unexpected vector length");
18455
18456 MVT VT = Op.getSimpleValueType();
18457
18458 if (VT == MVT::i16) {
18459 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18460 // we're going to zero extend the register or fold the store (SSE41 only).
18461 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18462 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18463 if (Subtarget.hasFP16())
18464 return Op;
18465
18466 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18467 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18468 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18469 }
18470
18471 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18472 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18473 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18474 }
18475
18476 if (Subtarget.hasSSE41())
18477 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18478 return Res;
18479
18480 // Only extract a single element from a v16i8 source - determine the common
18481 // DWORD/WORD that all extractions share, and extract the sub-byte.
18482 // TODO: Add QWORD MOVQ extraction?
18483 if (VT == MVT::i8) {
18484 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18485 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18486
18487 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18488 int DWordIdx = IdxVal / 4;
18489 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18490 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18491 DAG.getBitcast(MVT::v4i32, Vec),
18492 DAG.getVectorIdxConstant(DWordIdx, dl));
18493 int ShiftVal = (IdxVal % 4) * 8;
18494 if (ShiftVal != 0)
18495 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18496 DAG.getConstant(ShiftVal, dl, MVT::i8));
18497 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18498 }
18499
18500 int WordIdx = IdxVal / 2;
18501 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18502 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18503 DAG.getBitcast(MVT::v8i16, Vec),
18504 DAG.getVectorIdxConstant(WordIdx, dl));
18505 int ShiftVal = (IdxVal % 2) * 8;
18506 if (ShiftVal != 0)
18507 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18508 DAG.getConstant(ShiftVal, dl, MVT::i8));
18509 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18510 }
18511 }
18512
18513 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18514 if (IdxVal == 0)
18515 return Op;
18516
18517 // Shuffle the element to the lowest element, then movss or movsh.
18518 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18519 Mask[0] = static_cast<int>(IdxVal);
18520 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18521 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18522 DAG.getVectorIdxConstant(0, dl));
18523 }
18524
18525 if (VT.getSizeInBits() == 64) {
18526 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18527 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18528 // to match extract_elt for f64.
18529 if (IdxVal == 0)
18530 return Op;
18531
18532 // UNPCKHPD the element to the lowest double word, then movsd.
18533 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18534 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18535 int Mask[2] = { 1, -1 };
18536 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18537 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18538 DAG.getVectorIdxConstant(0, dl));
18539 }
18540
18541 return SDValue();
18542}
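// Example: extracting element 9 of a v16i16 first narrows to the 128-bit chunk
// holding that element and then extracts element 9 & 7 == 1 from the resulting
// v8i16 (typically with pextrw), so no 256-bit extraction pattern is needed.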
18543
18544/// Insert one bit to mask vector, like v16i1 or v8i1.
18545/// AVX-512 feature.
18546static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18547 const X86Subtarget &Subtarget) {
18548 SDLoc dl(Op);
18549 SDValue Vec = Op.getOperand(0);
18550 SDValue Elt = Op.getOperand(1);
18551 SDValue Idx = Op.getOperand(2);
18552 MVT VecVT = Vec.getSimpleValueType();
18553
18554 if (!isa<ConstantSDNode>(Idx)) {
18555 // Non constant index. Extend source and destination,
18556 // insert element and then truncate the result.
18557 unsigned NumElts = VecVT.getVectorNumElements();
18558 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18559 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18560 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18561 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18562 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18563 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18564 }
18565
18566 // Copy into a k-register, extract to v1i1 and insert_subvector.
18567 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18568 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18569}
18570
18571SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18572 SelectionDAG &DAG) const {
18573 MVT VT = Op.getSimpleValueType();
18574 MVT EltVT = VT.getVectorElementType();
18575 unsigned NumElts = VT.getVectorNumElements();
18576 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18577
18578 if (EltVT == MVT::i1)
18579 return InsertBitToMaskVector(Op, DAG, Subtarget);
18580
18581 SDLoc dl(Op);
18582 SDValue N0 = Op.getOperand(0);
18583 SDValue N1 = Op.getOperand(1);
18584 SDValue N2 = Op.getOperand(2);
18585 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18586
18587 if (EltVT == MVT::bf16) {
18588 MVT IVT = VT.changeVectorElementTypeToInteger();
18589 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18590 DAG.getBitcast(IVT, N0),
18591 DAG.getBitcast(MVT::i16, N1), N2);
18592 return DAG.getBitcast(VT, Res);
18593 }
18594
18595 if (!N2C) {
18596 // Variable insertion indices, usually we're better off spilling to stack,
18597 // but AVX512 can use a variable compare+select by comparing against all
18598 // possible vector indices, and FP insertion has less gpr->simd traffic.
18599 if (!(Subtarget.hasBWI() ||
18600 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18601 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18602 return SDValue();
18603
18604 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18605 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18606 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18607 return SDValue();
18608
18609 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18610 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18611 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18612
18613 SmallVector<SDValue, 16> RawIndices;
18614 for (unsigned I = 0; I != NumElts; ++I)
18615 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18616 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18617
18618 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18619 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18620 ISD::CondCode::SETEQ);
18621 }
18622
18623 if (N2C->getAPIntValue().uge(NumElts))
18624 return SDValue();
18625 uint64_t IdxVal = N2C->getZExtValue();
18626
18627 bool IsZeroElt = X86::isZeroNode(N1);
18628 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18629
18630 if (IsZeroElt || IsAllOnesElt) {
18631 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18632 // We don't deal with i8 0 since it appears to be handled elsewhere.
18633 if (IsAllOnesElt &&
18634 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18635 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18636 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18637 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18638 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18639 CstVectorElts[IdxVal] = OnesCst;
18640 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18641 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18642 }
18643 // See if we can do this more efficiently with a blend shuffle with a
18644 // rematerializable vector.
18645 if (Subtarget.hasSSE41() &&
18646 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18647 SmallVector<int, 8> BlendMask;
18648 for (unsigned i = 0; i != NumElts; ++i)
18649 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18650 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18651 : getOnesVector(VT, DAG, dl);
18652 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18653 }
18654 }
18655
18656 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18657 // into that, and then insert the subvector back into the result.
18658 if (VT.is256BitVector() || VT.is512BitVector()) {
18659 // With a 256-bit vector, we can insert into the zero element efficiently
18660 // using a blend if we have AVX or AVX2 and the right data type.
18661 if (VT.is256BitVector() && IdxVal == 0) {
18662 // TODO: It is worthwhile to cast integer to floating point and back
18663 // and incur a domain crossing penalty if that's what we'll end up
18664 // doing anyway after extracting to a 128-bit vector.
18665 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18666 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18667 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18668 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18669 DAG.getTargetConstant(1, dl, MVT::i8));
18670 }
18671 }
18672
18673 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18674 assert(isPowerOf2_32(NumEltsIn128) &&
18675 "Vectors will always have power-of-two number of elements.");
18676
18677 // If we are not inserting into the low 128-bit vector chunk,
18678 // then prefer the broadcast+blend sequence.
18679 // FIXME: relax the profitability check iff all N1 uses are insertions.
18680 if (IdxVal >= NumEltsIn128 &&
18681 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18682 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18683 X86::mayFoldLoad(N1, Subtarget)))) {
18684 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18685 SmallVector<int, 8> BlendMask;
18686 for (unsigned i = 0; i != NumElts; ++i)
18687 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18688 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18689 }
18690
18691 // Get the desired 128-bit vector chunk.
18692 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18693
18694 // Insert the element into the desired chunk.
18695 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18696 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18697
18698 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18699 DAG.getVectorIdxConstant(IdxIn128, dl));
18700
18701 // Insert the changed part back into the bigger vector
18702 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18703 }
18704 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18705
18706 // This will be just movw/movd/movq/movsh/movss/movsd.
18707 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18708 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18709 EltVT == MVT::f16 || EltVT == MVT::i64) {
18710 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18711 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18712 }
18713
18714 // We can't directly insert an i8 or i16 into a vector, so zero extend
18715 // it to i32 first.
18716 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18717 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18718 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18719 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18720 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18721 return DAG.getBitcast(VT, N1);
18722 }
18723 }
18724
18725 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18726 // argument. SSE41 required for pinsrb.
18727 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18728 unsigned Opc;
18729 if (VT == MVT::v8i16) {
18730 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18731 Opc = X86ISD::PINSRW;
18732 } else {
18733 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18734 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18735 Opc = X86ISD::PINSRB;
18736 }
18737
18738 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18739 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18740 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18741 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18742 }
18743
18744 if (Subtarget.hasSSE41()) {
18745 if (EltVT == MVT::f32) {
18746 // Bits [7:6] of the constant are the source select. This will always be
18747 // zero here. The DAG Combiner may combine an extract_elt index into
18748 // these bits. For example (insert (extract, 3), 2) could be matched by
18749 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18750 // Bits [5:4] of the constant are the destination select. This is the
18751 // value of the incoming immediate.
18752 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18753 // combine either bitwise AND or insert of float 0.0 to set these bits.
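// For example, inserting into lane 2 gives IdxVal << 4 = 0x20: destination
// select (bits [5:4]) = 2, source select (bits [7:6]) = 0, and zero mask
// (bits [3:0]) = 0.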
18754
18755 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18756 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18757 // If this is an insertion of 32-bits into the low 32-bits of
18758 // a vector, we prefer to generate a blend with immediate rather
18759 // than an insertps. Blends are simpler operations in hardware and so
18760 // will always have equal or better performance than insertps.
18761 // But if optimizing for size and there's a load folding opportunity,
18762 // generate insertps because blendps does not have a 32-bit memory
18763 // operand form.
18764 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18765 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18766 DAG.getTargetConstant(1, dl, MVT::i8));
18767 }
18768 // Create this as a scalar to vector.
18769 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18770 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18771 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18772 }
18773
18774 // PINSR* works with constant index.
18775 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18776 return Op;
18777 }
18778
18779 return SDValue();
18780}
18781
18782static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18783 SelectionDAG &DAG) {
18784 SDLoc dl(Op);
18785 MVT OpVT = Op.getSimpleValueType();
18786
18787 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
18788 // further combines.
18789 if (X86::isZeroNode(Op.getOperand(0)))
18790 return getZeroVector(OpVT, Subtarget, DAG, dl);
18791
18792 // If this is a 256-bit vector result, first insert into a 128-bit
18793 // vector and then insert into the 256-bit vector.
18794 if (!OpVT.is128BitVector()) {
18795 // Insert into a 128-bit vector.
18796 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18797 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18798 OpVT.getVectorNumElements() / SizeFactor);
18799
18800 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18801
18802 // Insert the 128-bit vector.
18803 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18804 }
18805 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18806 "Expected an SSE type!");
18807
18808 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18809 // tblgen.
18810 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18811 return Op;
18812
18813 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18814 return DAG.getBitcast(
18815 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18816}
18817
18818// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18819// simple superregister reference or explicit instructions to insert
18820// the upper bits of a vector.
18821static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18822 SelectionDAG &DAG) {
18823 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18824
18825 return insert1BitVector(Op, DAG, Subtarget);
18826}
18827
18828static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18829 SelectionDAG &DAG) {
18830 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18831 "Only vXi1 extract_subvectors need custom lowering");
18832
18833 SDLoc dl(Op);
18834 SDValue Vec = Op.getOperand(0);
18835 uint64_t IdxVal = Op.getConstantOperandVal(1);
18836
18837 if (IdxVal == 0) // the operation is legal
18838 return Op;
18839
18840 // Extend to natively supported kshift.
18841 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18842
18843 // Shift to the LSB.
18844 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18845 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18846
18847 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18848 DAG.getVectorIdxConstant(0, dl));
18849}
18850
18851// Returns the appropriate wrapper opcode for a global reference.
18852unsigned X86TargetLowering::getGlobalWrapperKind(
18853 const GlobalValue *GV, const unsigned char OpFlags) const {
18854 // References to absolute symbols are never PC-relative.
18855 if (GV && GV->isAbsoluteSymbolRef())
18856 return X86ISD::Wrapper;
18857
18858 // The following OpFlags under RIP-rel PIC use RIP.
18859 if (Subtarget.isPICStyleRIPRel() &&
18860 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18861 OpFlags == X86II::MO_DLLIMPORT))
18862 return X86ISD::WrapperRIP;
18863
18864 // GOTPCREL references must always use RIP.
18865 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18866 return X86ISD::WrapperRIP;
18867
18868 return X86ISD::Wrapper;
18869}
18870
18871// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18872// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18873 // one of the above-mentioned nodes. It has to be wrapped because otherwise
18874 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18875 // be used to form an addressing mode. These wrapped nodes will be selected
18876// into MOV32ri.
18877SDValue
18878X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18879 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18880
18881 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18882 // global base reg.
18883 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18884
18885 auto PtrVT = getPointerTy(DAG.getDataLayout());
18886 SDValue Result = DAG.getTargetConstantPool(
18887 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18888 SDLoc DL(CP);
18889 Result =
18890 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18891 // With PIC, the address is actually $g + Offset.
18892 if (OpFlag) {
18893 Result =
18894 DAG.getNode(ISD::ADD, DL, PtrVT,
18895 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18896 }
18897
18898 return Result;
18899}
18900
18901SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18902 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18903
18904 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18905 // global base reg.
18906 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18907
18908 auto PtrVT = getPointerTy(DAG.getDataLayout());
18909 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18910 SDLoc DL(JT);
18911 Result =
18912 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18913
18914 // With PIC, the address is actually $g + Offset.
18915 if (OpFlag)
18916 Result =
18917 DAG.getNode(ISD::ADD, DL, PtrVT,
18918 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18919
18920 return Result;
18921}
18922
18923SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18924 SelectionDAG &DAG) const {
18925 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18926}
18927
18928SDValue
18929X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18930 // Create the TargetBlockAddressAddress node.
18931 unsigned char OpFlags =
18932 Subtarget.classifyBlockAddressReference();
18933 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18934 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18935 SDLoc dl(Op);
18936 auto PtrVT = getPointerTy(DAG.getDataLayout());
18937 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18938 Result =
18939 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18940
18941 // With PIC, the address is actually $g + Offset.
18942 if (isGlobalRelativeToPICBase(OpFlags)) {
18943 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18944 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18945 }
18946
18947 return Result;
18948}
18949
18950/// Creates target global address or external symbol nodes for calls or
18951/// other uses.
18952SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18953 bool ForCall) const {
18954 // Unpack the global address or external symbol.
18955 SDLoc dl(Op);
18956 const GlobalValue *GV = nullptr;
18957 int64_t Offset = 0;
18958 const char *ExternalSym = nullptr;
18959 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18960 GV = G->getGlobal();
18961 Offset = G->getOffset();
18962 } else {
18963 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18964 ExternalSym = ES->getSymbol();
18965 }
18966
18967 // Calculate some flags for address lowering.
18968 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18969 unsigned char OpFlags;
18970 if (ForCall)
18971 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18972 else
18973 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18974 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18975 bool NeedsLoad = isGlobalStubReference(OpFlags);
18976
18978 auto PtrVT = getPointerTy(DAG.getDataLayout());
18979 SDValue Result;
18980
18981 if (GV) {
18982 // Create a target global address if this is a global. If possible, fold the
18983 // offset into the global address reference. Otherwise, ADD it on later.
18984 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18985 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18986 // relocation will compute to a negative value, which is invalid.
18987 int64_t GlobalOffset = 0;
18988 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18990 std::swap(GlobalOffset, Offset);
18991 }
18992 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18993 } else {
18994 // If this is not a global address, this must be an external symbol.
18995 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18996 }
18997
18998 // If this is a direct call, avoid the wrapper if we don't need to do any
18999 // loads or adds. This allows SDAG ISel to match direct calls.
19000 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19001 return Result;
19002
19003 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19004
19005 // With PIC, the address is actually $g + Offset.
19006 if (HasPICReg) {
19007 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19008 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19009 }
19010
19011 // For globals that require a load from a stub to get the address, emit the
19012 // load.
19013 if (NeedsLoad)
19014 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19015 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19016
19017 // If there was a non-zero offset that we didn't fold, create an explicit
19018 // addition for it.
19019 if (Offset != 0)
19020 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19021 DAG.getSignedConstant(Offset, dl, PtrVT));
19022
19023 return Result;
19024}
19025
19026SDValue
19027X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19028 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19029}
19030
19031static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19032 const EVT PtrVT, unsigned ReturnReg,
19033 unsigned char OperandFlags,
19034 bool LoadGlobalBaseReg = false,
19035 bool LocalDynamic = false) {
19036 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19037 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19038 SDLoc dl(GA);
19039 SDValue TGA;
19040 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19041 SDValue Chain = DAG.getEntryNode();
19042 SDValue Ret;
19043 if (LocalDynamic && UseTLSDESC) {
19044 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19045 // Reuse existing GetTLSADDR node if we can find it.
19046 if (TGA->hasOneUse()) {
19047 // TLSDESC uses TGA.
19048 SDNode *TLSDescOp = *TGA->user_begin();
19049 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19050 "Unexpected TLSDESC DAG");
19051 // CALLSEQ_END uses TGA via a chain and glue.
19052 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19053 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19054 "Unexpected TLSDESC DAG");
19055 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19056 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19057 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19058 "Unexpected TLSDESC DAG");
19059 Ret = SDValue(CopyFromRegOp, 0);
19060 }
19061 } else {
19062 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19063 GA->getOffset(), OperandFlags);
19064 }
19065
19066 if (!Ret) {
19067 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19068 : LocalDynamic ? X86ISD::TLSBASEADDR
19069 : X86ISD::TLSADDR;
19070
19071 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19072 if (LoadGlobalBaseReg) {
19073 SDValue InGlue;
19074 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19075 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19076 InGlue);
19077 InGlue = Chain.getValue(1);
19078 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19079 } else {
19080 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19081 }
19082 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19083
19084 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
19085 MFI.setHasCalls(true);
19086
19087 SDValue Glue = Chain.getValue(1);
19088 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19089 }
19090
19091 if (!UseTLSDESC)
19092 return Ret;
19093
19094 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19095 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19096
19098 SDValue Offset =
19099 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19101 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19102}
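// Roughly, on x86-64 the TLSDESC path above corresponds to:
//   leaq x@tlsdesc(%rip), %rax
//   call *x@tlscall(%rax)      # returns x's offset from the thread pointer
//   addq %fs:0, %rax           # add the thread pointer loaded from %fs:0
// (illustrative sketch; the exact registers and scheduling may differ)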
19103
19104// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19105static SDValue
19106LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19107 const EVT PtrVT) {
19108 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19109 /*LoadGlobalBaseReg=*/true);
19110}
19111
19112// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19113static SDValue
19114LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19115 const EVT PtrVT) {
19116 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19117}
19118
19119// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19120static SDValue
19121LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19122 const EVT PtrVT) {
19123 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19124}
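// Roughly, the 64-bit general-dynamic form amounts to (ignoring the padding
// bytes emitted so the linker can relax the sequence):
//   leaq x@tlsgd(%rip), %rdi
//   call __tls_get_addr@PLT    # returns the address of x in %rax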
19125
19126static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19127 SelectionDAG &DAG, const EVT PtrVT,
19128 bool Is64Bit, bool Is64BitLP64) {
19129 SDLoc dl(GA);
19130
19131 // Get the start address of the TLS block for this module.
19135
19136 SDValue Base;
19137 if (Is64Bit) {
19138 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19139 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19140 /*LoadGlobalBaseReg=*/false,
19141 /*LocalDynamic=*/true);
19142 } else {
19143 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19144 /*LoadGlobalBaseReg=*/true,
19145 /*LocalDynamic=*/true);
19146 }
19147
19148 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19149 // of Base.
19150
19151 // Build x@dtpoff.
19152 unsigned char OperandFlags = X86II::MO_DTPOFF;
19153 unsigned WrapperKind = X86ISD::Wrapper;
19154 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19155 GA->getValueType(0),
19156 GA->getOffset(), OperandFlags);
19157 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19158
19159 // Add x@dtpoff with the base.
19160 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19161}
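// Roughly, on x86-64 the local-dynamic form becomes:
//   leaq x@tlsld(%rip), %rdi
//   call __tls_get_addr@PLT    # returns this module's TLS base in %rax
//   leaq x@dtpoff(%rax), %rcx  # add x's known offset within the block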
19162
19163// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19164static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19165 const EVT PtrVT, TLSModel::Model model,
19166 bool is64Bit, bool isPIC) {
19167 SDLoc dl(GA);
19168
19169 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19172
19173 SDValue ThreadPointer =
19174 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19176
19177 unsigned char OperandFlags = 0;
19178 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19179 // initialexec.
19180 unsigned WrapperKind = X86ISD::Wrapper;
19181 if (model == TLSModel::LocalExec) {
19182 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19183 } else if (model == TLSModel::InitialExec) {
19184 if (is64Bit) {
19185 OperandFlags = X86II::MO_GOTTPOFF;
19186 WrapperKind = X86ISD::WrapperRIP;
19187 } else {
19188 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19189 }
19190 } else {
19191 llvm_unreachable("Unexpected model");
19192 }
19193
19194 // emit "addl x@ntpoff,%eax" (local exec)
19195 // or "addl x@indntpoff,%eax" (initial exec)
19196 // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
19197 SDValue TGA =
19198 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19199 GA->getOffset(), OperandFlags);
19200 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19201
19202 if (model == TLSModel::InitialExec) {
19203 if (isPIC && !is64Bit) {
19204 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19205 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19206 Offset);
19207 }
19208
19209 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19210 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19211 }
19212
19213 // The address of the thread local variable is the add of the thread
19214 // pointer with the offset of the variable.
19215 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19216}
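// The 64-bit counterparts of the sequences above are roughly:
//   movq %fs:0, %rax; leaq x@tpoff(%rax), %rax     (local exec)
//   movq %fs:0, %rax; addq x@gottpoff(%rip), %rax  (initial exec)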
19217
19218SDValue
19219X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19220
19221 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19222
19223 if (DAG.getTarget().useEmulatedTLS())
19224 return LowerToTLSEmulatedModel(GA, DAG);
19225
19226 const GlobalValue *GV = GA->getGlobal();
19227 auto PtrVT = getPointerTy(DAG.getDataLayout());
19228 bool PositionIndependent = isPositionIndependent();
19229
19230 if (Subtarget.isTargetELF()) {
19231 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19232 switch (model) {
19233 case TLSModel::GeneralDynamic:
19234 if (Subtarget.is64Bit()) {
19235 if (Subtarget.isTarget64BitLP64())
19236 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19237 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19238 }
19239 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19240 case TLSModel::LocalDynamic:
19241 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19242 Subtarget.isTarget64BitLP64());
19243 case TLSModel::InitialExec:
19244 case TLSModel::LocalExec:
19245 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19246 PositionIndependent);
19247 }
19248 llvm_unreachable("Unknown TLS model.");
19249 }
19250
19251 if (Subtarget.isTargetDarwin()) {
19252 // Darwin only has one model of TLS. Lower to that.
19253 unsigned char OpFlag = 0;
19254 unsigned WrapperKind = 0;
19255
19256 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19257 // global base reg.
19258 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19259 if (PIC32) {
19260 OpFlag = X86II::MO_TLVP_PIC_BASE;
19261 WrapperKind = X86ISD::Wrapper;
19262 } else {
19263 OpFlag = X86II::MO_TLVP;
19264 WrapperKind = X86ISD::WrapperRIP;
19265 }
19266 SDLoc DL(Op);
19267 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19268 GA->getValueType(0),
19269 GA->getOffset(), OpFlag);
19270 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19271
19272 // With PIC32, the address is actually $g + Offset.
19273 if (PIC32)
19274 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19275 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19276 Offset);
19277
19278 // Lowering the machine isd will make sure everything is in the right
19279 // location.
19280 SDValue Chain = DAG.getEntryNode();
19281 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19282 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19283 SDValue Args[] = { Chain, Offset };
19284 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19285 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19286
19287 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
19288 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19289 MFI.setAdjustsStack(true);
19290
19291 // And our return value (tls address) is in the standard call return value
19292 // location.
19293 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19294 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19295 }
19296
19297 if (Subtarget.isOSWindows()) {
19298 // Just use the implicit TLS architecture
19299 // Need to generate something similar to:
19300 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19301 // ; from TEB
19302 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19303 // mov rcx, qword [rdx+rcx*8]
19304 // mov eax, .tls$:tlsvar
19305 // [rax+rcx] contains the address
19306 // Windows 64bit: gs:0x58
19307 // Windows 32bit: fs:__tls_array
19308
19309 SDLoc dl(GA);
19310 SDValue Chain = DAG.getEntryNode();
19311
19312 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19313 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19314 // use its literal value of 0x2C.
19316 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19318
19319 SDValue TlsArray = Subtarget.is64Bit()
19320 ? DAG.getIntPtrConstant(0x58, dl)
19321 : (Subtarget.isTargetWindowsGNU()
19322 ? DAG.getIntPtrConstant(0x2C, dl)
19323 : DAG.getExternalSymbol("_tls_array", PtrVT));
19324
19325 SDValue ThreadPointer =
19326 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19327
19328 SDValue res;
19330 res = ThreadPointer;
19331 } else {
19332 // Load the _tls_index variable
19333 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19334 if (Subtarget.is64Bit())
19335 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19336 MachinePointerInfo(), MVT::i32);
19337 else
19338 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19339
19340 const DataLayout &DL = DAG.getDataLayout();
19341 SDValue Scale =
19342 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19343 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19344
19345 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19346 }
19347
19348 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19349
19350 // Get the offset of start of .tls section
19351 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19352 GA->getValueType(0),
19353 GA->getOffset(), X86II::MO_SECREL);
19354 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19355
19356 // The address of the thread local variable is the add of the thread
19357 // pointer with the offset of the variable.
19358 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19359 }
19360
19361 llvm_unreachable("TLS not implemented for this target.");
19362}
19363
19364bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19365 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19366 const TargetMachine &TM = getTargetMachine();
19367 TLSModel::Model Model = TM.getTLSModel(&GV);
19368 switch (Model) {
19369 case TLSModel::LocalExec:
19370 case TLSModel::InitialExec:
19371 // We can include the %fs segment register in addressing modes.
19372 return true;
19373 case TLSModel::GeneralDynamic:
19374 case TLSModel::LocalDynamic:
19375 // These models do not result in %fs relative addresses unless
19376 // TLS descriptors are used.
19377 //
19378 // Even in the case of TLS descriptors we currently have no way to model
19379 // the difference between %fs access and the computations needed for the
19380 // offset, and returning `true` for TLS-desc currently duplicates both,
19381 // which is detrimental :-/
19382 return false;
19383 }
19384 }
19385 return false;
19386}
19387
19388/// Lower SRA_PARTS and friends, which return two i32 values
19389/// and take a 2 x i32 value to shift plus a shift amount.
19390/// TODO: Can this be moved to general expansion code?
19391static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19392 SDValue Lo, Hi;
19393 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19394 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19395}
19396
19397// Try to use a packed vector operation to handle i64 on 32-bit targets when
19398// AVX512DQ is enabled.
19399static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19400 SelectionDAG &DAG,
19401 const X86Subtarget &Subtarget) {
19402 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19403 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19404 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19405 Op.getOpcode() == ISD::UINT_TO_FP) &&
19406 "Unexpected opcode!");
19407 bool IsStrict = Op->isStrictFPOpcode();
19408 unsigned OpNo = IsStrict ? 1 : 0;
19409 SDValue Src = Op.getOperand(OpNo);
19410 MVT SrcVT = Src.getSimpleValueType();
19411 MVT VT = Op.getSimpleValueType();
19412
19413 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19414 (VT != MVT::f32 && VT != MVT::f64))
19415 return SDValue();
19416
19417 // Pack the i64 into a vector, do the operation and extract.
19418
19419 // Use 256 bits to ensure the result is 128 bits for the f32 case.
19420 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19421 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19422 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19423
19424 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19425 if (IsStrict) {
19426 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19427 {Op.getOperand(0), InVec});
19428 SDValue Chain = CvtVec.getValue(1);
19429 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19430 DAG.getVectorIdxConstant(0, dl));
19431 return DAG.getMergeValues({Value, Chain}, dl);
19432 }
19433
19434 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19435
19436 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19437 DAG.getVectorIdxConstant(0, dl));
19438}
19439
19440// Try to use a packed vector operation to handle i64 on 32-bit targets.
19441static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19442 const X86Subtarget &Subtarget) {
19443 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19444 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19445 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19446 Op.getOpcode() == ISD::UINT_TO_FP) &&
19447 "Unexpected opcode!");
19448 bool IsStrict = Op->isStrictFPOpcode();
19449 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19450 MVT SrcVT = Src.getSimpleValueType();
19451 MVT VT = Op.getSimpleValueType();
19452
19453 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19454 return SDValue();
19455
19456 // Pack the i64 into a vector, do the operation and extract.
19457
19458 assert(Subtarget.hasFP16() && "Expected FP16");
19459
19460 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19461 if (IsStrict) {
19462 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19463 {Op.getOperand(0), InVec});
19464 SDValue Chain = CvtVec.getValue(1);
19465 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19466 DAG.getVectorIdxConstant(0, dl));
19467 return DAG.getMergeValues({Value, Chain}, dl);
19468 }
19469
19470 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19471
19472 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19473 DAG.getVectorIdxConstant(0, dl));
19474}
19475
19476static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19477 const X86Subtarget &Subtarget) {
19478 switch (Opcode) {
19479 case ISD::SINT_TO_FP:
19480 // TODO: Handle wider types with AVX/AVX512.
19481 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19482 return false;
19483 // CVTDQ2PS or (V)CVTDQ2PD
19484 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19485
19486 case ISD::UINT_TO_FP:
19487 // TODO: Handle wider types and i64 elements.
19488 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19489 return false;
19490 // VCVTUDQ2PS or VCVTUDQ2PD
19491 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19492
19493 default:
19494 return false;
19495 }
19496}
19497
19498/// Given a scalar cast operation that is extracted from a vector, try to
19499/// vectorize the cast op followed by extraction. This will avoid an expensive
19500/// round-trip between XMM and GPR.
19501static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19502 SelectionDAG &DAG,
19503 const X86Subtarget &Subtarget) {
19504 // TODO: This could be enhanced to handle smaller integer types by peeking
19505 // through an extend.
19506 SDValue Extract = Cast.getOperand(0);
19507 MVT DestVT = Cast.getSimpleValueType();
19508 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19509 !isa<ConstantSDNode>(Extract.getOperand(1)))
19510 return SDValue();
19511
19512 // See if we have a 128-bit vector cast op for this type of cast.
19513 SDValue VecOp = Extract.getOperand(0);
19514 MVT FromVT = VecOp.getSimpleValueType();
19515 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19516 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19517 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19518 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19519 return SDValue();
19520
19521 // If we are extracting from a non-zero element, first shuffle the source
19522 // vector to allow extracting from element zero.
19523 if (!isNullConstant(Extract.getOperand(1))) {
19524 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19525 Mask[0] = Extract.getConstantOperandVal(1);
19526 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19527 }
19528 // If the source vector is wider than 128-bits, extract the low part. Do not
19529 // create an unnecessarily wide vector cast op.
19530 if (FromVT != Vec128VT)
19531 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19532
19533 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19534 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19535 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19536 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19537 DAG.getVectorIdxConstant(0, DL));
19538}
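// For example, for (float)v[0] with a v4i32 value v in %xmm0 this prefers
//   cvtdq2ps %xmm0, %xmm1              # lane 0 holds the result
// over the scalar round-trip
//   movd %xmm0, %eax; cvtsi2ssl %eax, %xmm1
// (illustrative; the actual instructions depend on the cast and types).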
19539
19540/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19541/// try to vectorize the cast ops. This will avoid an expensive round-trip
19542/// between XMM and GPR.
19543static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19544 SelectionDAG &DAG,
19545 const X86Subtarget &Subtarget) {
19546 // TODO: Allow FP_TO_UINT.
19547 SDValue CastToInt = CastToFP.getOperand(0);
19548 MVT VT = CastToFP.getSimpleValueType();
19549 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19550 return SDValue();
19551
19552 MVT IntVT = CastToInt.getSimpleValueType();
19553 SDValue X = CastToInt.getOperand(0);
19554 MVT SrcVT = X.getSimpleValueType();
19555 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19556 return SDValue();
19557
19558 // See if we have 128-bit vector cast instructions for this type of cast.
19559 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19560 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19561 IntVT != MVT::i32)
19562 return SDValue();
19563
19564 unsigned SrcSize = SrcVT.getSizeInBits();
19565 unsigned IntSize = IntVT.getSizeInBits();
19566 unsigned VTSize = VT.getSizeInBits();
19567 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19568 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19569 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19570
19571 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19572 unsigned ToIntOpcode =
19573 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19574 unsigned ToFPOpcode =
19575 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19576
19577 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19578 //
19579 // We are not defining the high elements (for example, zero them) because
19580 // that could nullify any performance advantage that we hoped to gain from
19581 // this vector op hack. We do not expect any adverse effects (like denorm
19582 // penalties) with cast ops.
19583 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19584 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19585 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19586 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19587 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19588}
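// For example, (double)(int)x with x in %xmm0 can stay in the vector domain as
//   cvttpd2dq %xmm0, %xmm0; cvtdq2pd %xmm0, %xmm0
// instead of bouncing through a GPR with
//   cvttsd2si %xmm0, %eax; cvtsi2sdl %eax, %xmm0
// (illustrative sketch of the intent, not the exact emitted code).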
19589
19590static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19591 SelectionDAG &DAG,
19592 const X86Subtarget &Subtarget) {
19593 bool IsStrict = Op->isStrictFPOpcode();
19594 MVT VT = Op->getSimpleValueType(0);
19595 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19596
19597 if (Subtarget.hasDQI()) {
19598 assert(!Subtarget.hasVLX() && "Unexpected features");
19599
19600 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19601 Src.getSimpleValueType() == MVT::v4i64) &&
19602 "Unsupported custom type");
19603
19604 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19605 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19606 "Unexpected VT!");
19607 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19608
19609 // Need to concat with zero vector for strict fp to avoid spurious
19610 // exceptions.
19611 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19612 : DAG.getUNDEF(MVT::v8i64);
19613 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19614 DAG.getVectorIdxConstant(0, DL));
19615 SDValue Res, Chain;
19616 if (IsStrict) {
19617 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19618 {Op->getOperand(0), Src});
19619 Chain = Res.getValue(1);
19620 } else {
19621 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19622 }
19623
19624 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19625 DAG.getVectorIdxConstant(0, DL));
19626
19627 if (IsStrict)
19628 return DAG.getMergeValues({Res, Chain}, DL);
19629 return Res;
19630 }
19631
19632 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19633 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19634 if (VT != MVT::v4f32 || IsSigned)
19635 return SDValue();
19636
19637 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19638 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19639 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19640 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19641 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19642 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19643 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19644 SmallVector<SDValue, 4> SignCvts(4);
19645 SmallVector<SDValue, 4> Chains(4);
19646 for (int i = 0; i != 4; ++i) {
19647 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19648 DAG.getVectorIdxConstant(i, DL));
19649 if (IsStrict) {
19650 SignCvts[i] =
19651 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19652 {Op.getOperand(0), Elt});
19653 Chains[i] = SignCvts[i].getValue(1);
19654 } else {
19655 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19656 }
19657 }
19658 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19659
19660 SDValue Slow, Chain;
19661 if (IsStrict) {
19662 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19663 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19664 {Chain, SignCvt, SignCvt});
19665 Chain = Slow.getValue(1);
19666 } else {
19667 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19668 }
19669
19670 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19671 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19672
19673 if (IsStrict)
19674 return DAG.getMergeValues({Cvt, Chain}, DL);
19675
19676 return Cvt;
19677}
19678
19679static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
19680 SelectionDAG &DAG) {
19681 bool IsStrict = Op->isStrictFPOpcode();
19682 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19683 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19684 MVT VT = Op.getSimpleValueType();
19685 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19686
19687 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
19688 if (IsStrict)
19689 return DAG.getNode(
19690 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19691 {Chain,
19692 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19693 Rnd});
19694 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19695 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19696}
19697
19698static bool isLegalConversion(MVT VT, bool IsSigned,
19699 const X86Subtarget &Subtarget) {
19700 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19701 return true;
19702 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19703 return true;
19704 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19705 return true;
19706 if (Subtarget.useAVX512Regs()) {
19707 if (VT == MVT::v16i32)
19708 return true;
19709 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19710 return true;
19711 }
19712 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19713 (VT == MVT::v2i64 || VT == MVT::v4i64))
19714 return true;
19715 return false;
19716}
19717
19718SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19719 SelectionDAG &DAG) const {
19720 bool IsStrict = Op->isStrictFPOpcode();
19721 unsigned OpNo = IsStrict ? 1 : 0;
19722 SDValue Src = Op.getOperand(OpNo);
19723 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19724 MVT SrcVT = Src.getSimpleValueType();
19725 MVT VT = Op.getSimpleValueType();
19726 SDLoc dl(Op);
19727
19728 if (isSoftF16(VT, Subtarget))
19729 return promoteXINT_TO_FP(Op, dl, DAG);
19730 else if (isLegalConversion(SrcVT, true, Subtarget))
19731 return Op;
19732
19733 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19734 return LowerWin64_INT128_TO_FP(Op, DAG);
19735
19736 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19737 return Extract;
19738
19739 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19740 return R;
19741
19742 if (SrcVT.isVector()) {
19743 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19744 // Note: since v2f64 is a legal type, we don't need to zero extend the
19745 // source for strict FP.
19746 if (IsStrict)
19747 return DAG.getNode(
19748 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19749 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19750 DAG.getUNDEF(SrcVT))});
19751 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19752 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19753 DAG.getUNDEF(SrcVT)));
19754 }
19755 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19756 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19757
19758 return SDValue();
19759 }
19760
19761 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19762 "Unknown SINT_TO_FP to lower!");
19763
19764 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19765
19766 // These are really Legal; return the operand so the caller accepts it as
19767 // Legal.
19768 if (SrcVT == MVT::i32 && UseSSEReg)
19769 return Op;
19770 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19771 return Op;
19772
19773 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19774 return V;
19775 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19776 return V;
19777
19778 // SSE doesn't have an i16 conversion so we need to promote.
19779 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19780 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19781 if (IsStrict)
19782 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19783 {Chain, Ext});
19784
19785 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19786 }
19787
19788 if (VT == MVT::f128 || !Subtarget.hasX87())
19789 return SDValue();
19790
19791 SDValue ValueToStore = Src;
19792 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19793 // Bitcasting to f64 here allows us to do a single 64-bit store from
19794 // an SSE register, avoiding the store forwarding penalty that would come
19795 // with two 32-bit stores.
19796 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19797
19798 unsigned Size = SrcVT.getStoreSize();
19799 Align Alignment(Size);
19800 MachineFunction &MF = DAG.getMachineFunction();
19801 auto PtrVT = getPointerTy(MF.getDataLayout());
19802 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19803 MachinePointerInfo MPI =
19804 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19805 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19806 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19807 std::pair<SDValue, SDValue> Tmp =
19808 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19809
19810 if (IsStrict)
19811 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19812
19813 return Tmp.first;
19814}
19815
19816std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19817 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19818 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19819 // Build the FILD
19820 SDVTList Tys;
19821 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19822 if (useSSE)
19823 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19824 else
19825 Tys = DAG.getVTList(DstVT, MVT::Other);
19826
19827 SDValue FILDOps[] = {Chain, Pointer};
19828 SDValue Result =
19829 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19830 Alignment, MachineMemOperand::MOLoad);
19831 Chain = Result.getValue(1);
19832
19833 if (useSSE) {
19834 MachineFunction &MF = DAG.getMachineFunction();
19835 unsigned SSFISize = DstVT.getStoreSize();
19836 int SSFI =
19837 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19838 auto PtrVT = getPointerTy(MF.getDataLayout());
19839 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19840 Tys = DAG.getVTList(MVT::Other);
19841 SDValue FSTOps[] = {Chain, Result, StackSlot};
19844 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19845
19846 Chain =
19847 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19848 Result = DAG.getLoad(
19849 DstVT, DL, Chain, StackSlot,
19851 Chain = Result.getValue(1);
19852 }
19853
19854 return { Result, Chain };
19855}
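// Roughly, when the result must live in an SSE register this emits, for an
// i64 source:
//   fildll (slot)         # x87 load of the 64-bit integer
//   fstpl  (tmp)          # store it out as a double
//   movsd  (tmp), %xmm0   # reload into the SSE register
// (illustrative; the operand sizes depend on the source and destination types)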
19856
19857/// Horizontal vector math instructions may be slower than normal math with
19858/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19859/// implementation, and likely shuffle complexity of the alternate sequence.
19860static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19861 const X86Subtarget &Subtarget) {
19862 bool IsOptimizingSize = DAG.shouldOptForSize();
19863 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19864 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19865}
19866
19867/// 64-bit unsigned integer to double expansion.
19868static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
19869 SelectionDAG &DAG,
19870 const X86Subtarget &Subtarget) {
19871 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19872 // when converting 0 while rounding toward negative infinity. The caller will
19873 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19874 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19875 // This algorithm is not obvious. Here is what we're trying to output:
19876 /*
19877 movq %rax, %xmm0
19878 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19879 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19880 #ifdef __SSE3__
19881 haddpd %xmm0, %xmm0
19882 #else
19883 pshufd $0x4e, %xmm0, %xmm1
19884 addpd %xmm1, %xmm0
19885 #endif
19886 */
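// Why this works, roughly: after the unpack the two doubles have the bit
// patterns 0x43300000'xxxxxxxx and 0x45300000'yyyyyyyy, i.e. the exact values
// 2^52 + lo32 and 2^84 + hi32 * 2^32. Subtracting c1 = { 2^52, 2^84 } leaves
// { lo32, hi32 * 2^32 }, and the horizontal add yields lo32 + hi32 * 2^32,
// which rounds to the desired conversion of the original 64-bit value.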
19887
19888 LLVMContext *Context = DAG.getContext();
19889
19890 // Build some magic constants.
19891 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19892 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19893 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19894 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19895
19896 SmallVector<Constant*,2> CV1;
19897 CV1.push_back(
19898 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19899 APInt(64, 0x4330000000000000ULL))));
19900 CV1.push_back(
19901 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19902 APInt(64, 0x4530000000000000ULL))));
19903 Constant *C1 = ConstantVector::get(CV1);
19904 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19905
19906 // Load the 64-bit value into an XMM register.
19907 SDValue XR1 =
19908 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19909 SDValue CLod0 = DAG.getLoad(
19910 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19912 SDValue Unpck1 =
19913 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19914
19915 SDValue CLod1 = DAG.getLoad(
19916 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19918 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19919 // TODO: Are there any fast-math-flags to propagate here?
19920 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19921 SDValue Result;
19922
19923 if (Subtarget.hasSSE3() &&
19924 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19925 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19926 } else {
19927 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19928 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19929 }
19930 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19931 DAG.getVectorIdxConstant(0, dl));
19932 return Result;
19933}
19934
19935/// 32-bit unsigned integer to float expansion.
19936static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
19937 SelectionDAG &DAG,
19938 const X86Subtarget &Subtarget) {
19939 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19940 // FP constant to bias correct the final result.
19941 SDValue Bias = DAG.getConstantFP(
19942 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19943
19944 // Load the 32-bit value into an XMM register.
19945 SDValue Load =
19946 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19947
19948 // Zero out the upper parts of the register.
19949 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19950
19951 // Or the load with the bias.
19952 SDValue Or = DAG.getNode(
19953 ISD::OR, dl, MVT::v2i64,
19954 DAG.getBitcast(MVT::v2i64, Load),
19955 DAG.getBitcast(MVT::v2i64,
19956 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19957 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19958 DAG.getBitcast(MVT::v2f64, Or),
19959 DAG.getVectorIdxConstant(0, dl));
19960
19961 if (Op.getNode()->isStrictFPOpcode()) {
19962 // Subtract the bias.
19963 // TODO: Are there any fast-math-flags to propagate here?
19964 SDValue Chain = Op.getOperand(0);
19965 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19966 {Chain, Or, Bias});
19967
19968 if (Op.getValueType() == Sub.getValueType())
19969 return Sub;
19970
19971 // Handle final rounding.
19972 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19973 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19974
19975 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19976 }
19977
19978 // Subtract the bias.
19979 // TODO: Are there any fast-math-flags to propagate here?
19980 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19981
19982 // Handle final rounding.
19983 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19984}
19985
19986static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
19987 SelectionDAG &DAG,
19988 const X86Subtarget &Subtarget) {
19989 if (Op.getSimpleValueType() != MVT::v2f64)
19990 return SDValue();
19991
19992 bool IsStrict = Op->isStrictFPOpcode();
19993
19994 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19995 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19996
19997 if (Subtarget.hasAVX512()) {
19998 if (!Subtarget.hasVLX()) {
19999 // Let generic type legalization widen this.
20000 if (!IsStrict)
20001 return SDValue();
20002 // Otherwise pad the integer input with 0s and widen the operation.
20003 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20004 DAG.getConstant(0, DL, MVT::v2i32));
20005 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20006 {Op.getOperand(0), N0});
20007 SDValue Chain = Res.getValue(1);
20008 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20009 DAG.getVectorIdxConstant(0, DL));
20010 return DAG.getMergeValues({Res, Chain}, DL);
20011 }
20012
20013 // Legalize to v4i32 type.
20014 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20015 DAG.getUNDEF(MVT::v2i32));
20016 if (IsStrict)
20017 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20018 {Op.getOperand(0), N0});
20019 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20020 }
20021
20022 // Zero extend to v2i64, OR with the floating point representation of 2^52.
20023 // This gives us the floating point equivalent of 2^52 + the i32 integer
20024 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20025 // point leaving just our i32 integers in double format.
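// For example, for the 32-bit input 7 the OR produces the bit pattern
// 0x4330000000000007, i.e. the double 2^52 + 7, and subtracting 2^52 gives
// exactly 7.0.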
20026 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20027 SDValue VBias = DAG.getConstantFP(
20028 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20029 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20030 DAG.getBitcast(MVT::v2i64, VBias));
20031 Or = DAG.getBitcast(MVT::v2f64, Or);
20032
20033 if (IsStrict)
20034 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20035 {Op.getOperand(0), Or, VBias});
20036 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20037}
20038
20039static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20040 SelectionDAG &DAG,
20041 const X86Subtarget &Subtarget) {
20042 bool IsStrict = Op->isStrictFPOpcode();
20043 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20044 MVT VecIntVT = V.getSimpleValueType();
20045 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20046 "Unsupported custom type");
20047
20048 if (Subtarget.hasAVX512()) {
20049 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20050 assert(!Subtarget.hasVLX() && "Unexpected features");
20051 MVT VT = Op->getSimpleValueType(0);
20052
20053 // v8i32->v8f64 is legal with AVX512 so just return it.
20054 if (VT == MVT::v8f64)
20055 return Op;
20056
20057 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20058 "Unexpected VT!");
20059 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20060 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20061 // Need to concat with zero vector for strict fp to avoid spurious
20062 // exceptions.
20063 SDValue Tmp =
20064 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20065 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20066 DAG.getVectorIdxConstant(0, DL));
20067 SDValue Res, Chain;
20068 if (IsStrict) {
20069 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20070 {Op->getOperand(0), V});
20071 Chain = Res.getValue(1);
20072 } else {
20073 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20074 }
20075
20076 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20077 DAG.getVectorIdxConstant(0, DL));
20078
20079 if (IsStrict)
20080 return DAG.getMergeValues({Res, Chain}, DL);
20081 return Res;
20082 }
20083
20084 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20085 Op->getSimpleValueType(0) == MVT::v4f64) {
20086 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20087 Constant *Bias = ConstantFP::get(
20088 *DAG.getContext(),
20089 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20090 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20091 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20092 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20093 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20094 SDValue VBias = DAG.getMemIntrinsicNode(
20095 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20098
20099 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20100 DAG.getBitcast(MVT::v4i64, VBias));
20101 Or = DAG.getBitcast(MVT::v4f64, Or);
20102
20103 if (IsStrict)
20104 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20105 {Op.getOperand(0), Or, VBias});
20106 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20107 }
20108
20109 // The algorithm is the following:
20110 // #ifdef __SSE4_1__
20111 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20112 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20113 // (uint4) 0x53000000, 0xaa);
20114 // #else
20115 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20116 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20117 // #endif
20118 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20119 // return (float4) lo + fhi;
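// Worked example (illustrative): for v = 0x87654321,
//   lo = 0x4b004321 as a float is 2^23 + 0x4321, and
//   hi = 0x53008765 as a float is 2^39 + 0x8765 * 2^16 (both exact), so
//   fhi = hi - (2^39 + 2^23) = 0x8765 * 2^16 - 2^23 and
//   lo + fhi = 0x4321 + 0x87650000 = 0x87654321, rounded to the nearest float.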
20120
20121 bool Is128 = VecIntVT == MVT::v4i32;
20122 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20123 // If we convert to something other than the supported type, e.g., to v4f64,
20124 // abort early.
20125 if (VecFloatVT != Op->getSimpleValueType(0))
20126 return SDValue();
20127
20128 // In the #ifdef/#else code, we have in common:
20129 // - The vector of constants:
20130 // -- 0x4b000000
20131 // -- 0x53000000
20132 // - A shift:
20133 // -- v >> 16
20134
20135 // Create the splat vector for 0x4b000000.
20136 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20137 // Create the splat vector for 0x53000000.
20138 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20139
20140 // Create the right shift.
20141 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20142 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20143
20144 SDValue Low, High;
20145 if (Subtarget.hasSSE41()) {
20146 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20147 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20148 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20149 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20150 // Low will be bitcasted right away, so do not bother bitcasting back to its
20151 // original type.
20152 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20153 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20154 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20155 // (uint4) 0x53000000, 0xaa);
20156 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20157 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20158 // High will be bitcasted right away, so do not bother bitcasting back to
20159 // its original type.
20160 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20161 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20162 } else {
20163 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20164 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20165 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20166 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20167
20168 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20169 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20170 }
20171
20172 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20173 SDValue VecCstFSub = DAG.getConstantFP(
20174 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20175
20176 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20177 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20178 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20179 // enabled. See PR24512.
20180 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20181 // TODO: Are there any fast-math-flags to propagate here?
20182 // (float4) lo;
20183 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20184 // return (float4) lo + fhi;
20185 if (IsStrict) {
20186 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20187 {Op.getOperand(0), HighBitcast, VecCstFSub});
20188 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20189 {FHigh.getValue(1), LowBitcast, FHigh});
20190 }
20191
20192 SDValue FHigh =
20193 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20194 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20195}
20196
20197static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20198 const X86Subtarget &Subtarget) {
20199 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20200 SDValue N0 = Op.getOperand(OpNo);
20201 MVT SrcVT = N0.getSimpleValueType();
20202
20203 switch (SrcVT.SimpleTy) {
20204 default:
20205 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20206 case MVT::v2i32:
20207 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20208 case MVT::v4i32:
20209 case MVT::v8i32:
20210 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20211 case MVT::v2i64:
20212 case MVT::v4i64:
20213 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20214 }
20215}
20216
20217SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20218 SelectionDAG &DAG) const {
20219 bool IsStrict = Op->isStrictFPOpcode();
20220 unsigned OpNo = IsStrict ? 1 : 0;
20221 SDValue Src = Op.getOperand(OpNo);
20222 SDLoc dl(Op);
20223 auto PtrVT = getPointerTy(DAG.getDataLayout());
20224 MVT SrcVT = Src.getSimpleValueType();
20225 MVT DstVT = Op->getSimpleValueType(0);
20226 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20227
20228 // Bail out when we don't have native conversion instructions.
20229 if (DstVT == MVT::f128)
20230 return SDValue();
20231
20232 if (isSoftF16(DstVT, Subtarget))
20233 return promoteXINT_TO_FP(Op, dl, DAG);
20234 else if (isLegalConversion(SrcVT, false, Subtarget))
20235 return Op;
20236
20237 if (DstVT.isVector())
20238 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20239
20240 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20241 return LowerWin64_INT128_TO_FP(Op, DAG);
20242
20243 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20244 return Extract;
20245
20246 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20247 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20248 // Conversions from unsigned i32 to f32/f64 are legal,
20249 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20250 return Op;
20251 }
20252
20253 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20254 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20255 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20256 if (IsStrict)
20257 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20258 {Chain, Src});
20259 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20260 }
20261
20262 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20263 return V;
20264 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20265 return V;
20266
20267 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20268 // infinity. It produces -0.0, so disable under strictfp.
20269 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20270 !IsStrict)
20271 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20272 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20273 // negative infinity, so disable it under strictfp and use FILD instead.
20274 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20275 !IsStrict)
20276 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20277 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20278 (DstVT == MVT::f32 || DstVT == MVT::f64))
20279 return SDValue();
20280
20281 // Make a 64-bit buffer, and use it to build an FILD.
20282 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20283 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20284 Align SlotAlign(8);
20285 MachinePointerInfo MPI =
20286 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20287 if (SrcVT == MVT::i32) {
20288 SDValue OffsetSlot =
20289 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20290 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20291 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20292 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20293 std::pair<SDValue, SDValue> Tmp =
20294 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20295 if (IsStrict)
20296 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20297
20298 return Tmp.first;
20299 }
20300
20301 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20302 SDValue ValueToStore = Src;
20303 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20304 // Bitcasting to f64 here allows us to do a single 64-bit store from
20305 // an SSE register, avoiding the store forwarding penalty that would come
20306 // with two 32-bit stores.
20307 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20308 }
20309 SDValue Store =
20310 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20311 // For i64 source, we need to add the appropriate power of 2 if the input
20312 // was negative. We must be careful to do the computation in x87 extended
20313 // precision, not in SSE.
20314 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20315 SDValue Ops[] = {Store, StackSlot};
20316 SDValue Fild =
20317 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20318 SlotAlign, MachineMemOperand::MOLoad);
20319 Chain = Fild.getValue(1);
20320
20321 // Check whether the sign bit is set.
20322 SDValue SignSet = DAG.getSetCC(
20323 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20324 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20325
20326 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20327 APInt FF(64, 0x5F80000000000000ULL);
20328 SDValue FudgePtr =
20329 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20330 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20331
20332 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20333 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20334 SDValue Four = DAG.getIntPtrConstant(4, dl);
20335 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20336 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20337
20338 // Load the value out, extending it from f32 to f80.
20339 SDValue Fudge = DAG.getExtLoad(
20340 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20341 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20342 CPAlignment);
20343 Chain = Fudge.getValue(1);
20344 // Extend everything to 80 bits to force it to be done on x87.
20345 // TODO: Are there any fast-math-flags to propagate here?
20346 if (IsStrict) {
20347 unsigned Opc = ISD::STRICT_FADD;
20348 // Windows needs the precision control changed to 80bits around this add.
20349 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20350 Opc = X86ISD::STRICT_FP80_ADD;
20351
20352 SDValue Add =
20353 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20354 // STRICT_FP_ROUND can't handle equal types.
20355 if (DstVT == MVT::f80)
20356 return Add;
20357 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20358 {Add.getValue(1), Add,
20359 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20360 }
20361 unsigned Opc = ISD::FADD;
20362 // Windows needs the precision control changed to 80bits around this add.
20363 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20364 Opc = X86ISD::FP80_ADD;
20365
20366 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20367 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20368 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20369}
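// The x87 fallback above behaves like this scalar sketch (illustrative only;
// FILD always interprets the stored 64-bit value as signed):
//
//   long double uint64_to_fp80(uint64_t v) {
//     long double r = (long double)(int64_t)v; // FILD of the stored bits
//     if ((int64_t)v < 0)
//       r += 0x1.0p64L;   // the f32 fudge constant 0x5F800000 equals 2^64
//     return r;           // finally rounded to the f32/f64 destination
//   }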
20370
20371// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20372// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20373// just return an SDValue().
20374// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20375// to i16, i32 or i64, and we lower it to a legal sequence and return the
20376// result.
20377SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20378 bool IsSigned,
20379 SDValue &Chain) const {
20380 bool IsStrict = Op->isStrictFPOpcode();
20381 SDLoc DL(Op);
20382
20383 EVT DstTy = Op.getValueType();
20384 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20385 EVT TheVT = Value.getValueType();
20386 auto PtrVT = getPointerTy(DAG.getDataLayout());
20387
20388 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20389 // f16 must be promoted before using the lowering in this routine.
20390 // fp128 does not use this lowering.
20391 return SDValue();
20392 }
20393
20394 // If using FIST to compute an unsigned i64, we'll need some fixup
20395 // to handle values above the maximum signed i64. A FIST is always
20396 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20397 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20398
20399 // FIXME: This does not generate an invalid exception if the input does not
20400 // fit in i32. PR44019
20401 if (!IsSigned && DstTy != MVT::i64) {
20402 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20403 // The low 32 bits of the fist result will have the correct uint32 result.
20404 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20405 DstTy = MVT::i64;
20406 }
20407
20408 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20409 DstTy.getSimpleVT() >= MVT::i16 &&
20410 "Unknown FP_TO_INT to lower!");
20411
20412 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20413 // stack slot.
20414 MachineFunction &MF = DAG.getMachineFunction();
20415 unsigned MemSize = DstTy.getStoreSize();
20416 int SSFI =
20417 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20418 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20419
20420 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20421
20422 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20423
20424 if (UnsignedFixup) {
20425 //
20426 // Conversion to unsigned i64 is implemented with a select,
20427 // depending on whether the source value fits in the range
20428 // of a signed i64. Let Thresh be the FP equivalent of
20429 // 0x8000000000000000ULL.
20430 //
20431 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20432 // FltOfs = (Value >= Thresh) ? Thresh : 0.0;
20433 // FistSrc = (Value - FltOfs);
20434 // Fist-to-mem64 FistSrc
20435 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20436 // to XOR'ing the high 32 bits with Adjust.
20437 //
20438 // Being a power of 2, Thresh is exactly representable in all FP formats.
20439 // For X87 we'd like to use the smallest FP type for this constant, but
20440 // for DAG type consistency we have to match the FP operand type.
20441
20442 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20443 APFloat::opStatus Status = APFloat::opOK;
20444 bool LosesInfo = false;
20445 if (TheVT == MVT::f64)
20446 // The rounding mode is irrelevant as the conversion should be exact.
20447 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20448 &LosesInfo);
20449 else if (TheVT == MVT::f80)
20450 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20451 APFloat::rmNearestTiesToEven, &LosesInfo);
20452
20453 assert(Status == APFloat::opOK && !LosesInfo &&
20454 "FP conversion should have been exact");
20455
20456 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20457
20458 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20459 *DAG.getContext(), TheVT);
20460 SDValue Cmp;
20461 if (IsStrict) {
20462 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20463 /*IsSignaling*/ true);
20464 Chain = Cmp.getValue(1);
20465 } else {
20466 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20467 }
20468
20469 // Our preferred lowering of
20470 //
20471 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20472 //
20473 // is
20474 //
20475 // (Value >= Thresh) << 63
20476 //
20477 // but since we can get here after LegalOperations, DAGCombine might do the
20478 // wrong thing if we create a select. So, directly create the preferred
20479 // version.
20480 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20481 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20482 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20483
20484 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20485 DAG.getConstantFP(0.0, DL, TheVT));
20486
20487 if (IsStrict) {
20488 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20489 { Chain, Value, FltOfs });
20490 Chain = Value.getValue(1);
20491 } else
20492 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20493 }
20494
20495 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20496
20497 // FIXME This causes a redundant load/store if the SSE-class value is already
20498 // in memory, such as if it is on the callstack.
20499 if (isScalarFPTypeInSSEReg(TheVT)) {
20500 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20501 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20502 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20503 SDValue Ops[] = { Chain, StackSlot };
20504
20505 unsigned FLDSize = TheVT.getStoreSize();
20506 assert(FLDSize <= MemSize && "Stack slot not big enough");
20507 MachineMemOperand *MMO = MF.getMachineMemOperand(
20508 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20509 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20510 Chain = Value.getValue(1);
20511 }
20512
20513 // Build the FP_TO_INT*_IN_MEM
20514 MachineMemOperand *MMO = MF.getMachineMemOperand(
20515 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20516 SDValue Ops[] = { Chain, Value, StackSlot };
20517 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20518 DAG.getVTList(MVT::Other),
20519 Ops, DstTy, MMO);
20520
20521 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20522 Chain = Res.getValue(1);
20523
20524 // If we need an unsigned fixup, XOR the result with adjust.
20525 if (UnsignedFixup)
20526 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20527
20528 return Res;
20529}
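// For the unsigned-i64 destination, the fixup above amounts to this scalar
// sketch (illustrative; Thresh is 2^63, the 0x5f000000 constant read as f32):
//
//   uint64_t fp_to_uint64(long double x) {
//     bool big = (x >= 0x1.0p63L);              // Value >= Thresh
//     if (big)
//       x -= 0x1.0p63L;                         // subtract FltOfs
//     uint64_t r = (uint64_t)(int64_t)x;        // FIST to the stack slot
//     return big ? (r ^ 0x8000000000000000ULL) : r;  // XOR in Adjust
//   }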
20530
20531 static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20532 const X86Subtarget &Subtarget) {
20533 MVT VT = Op.getSimpleValueType();
20534 SDValue In = Op.getOperand(0);
20535 MVT InVT = In.getSimpleValueType();
20536 unsigned Opc = Op.getOpcode();
20537
20538 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20539 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20540 "Unexpected extension opcode");
20542 "Expected same number of elements");
20543 assert((VT.getVectorElementType() == MVT::i16 ||
20544 VT.getVectorElementType() == MVT::i32 ||
20545 VT.getVectorElementType() == MVT::i64) &&
20546 "Unexpected element type");
20547 assert((InVT.getVectorElementType() == MVT::i8 ||
20548 InVT.getVectorElementType() == MVT::i16 ||
20549 InVT.getVectorElementType() == MVT::i32) &&
20550 "Unexpected element type");
20551
20552 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20553
20554 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20555 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20556 return splitVectorIntUnary(Op, DAG, dl);
20557 }
20558
20559 if (Subtarget.hasInt256())
20560 return Op;
20561
20562 // Optimize vectors in AVX mode:
20563 //
20564 // v8i16 -> v8i32
20565 // Use vpmovzxwd for the 4 lower elements: v8i16 -> v4i32.
20566 // Use vpunpckhwd for the 4 upper elements: v8i16 -> v4i32.
20567 // Concat upper and lower parts.
20568 //
20569 // v4i32 -> v4i64
20570 // Use vpmovzxdq for the 2 lower elements: v4i32 -> v2i64.
20571 // Use vpunpckhdq for the 2 upper elements: v4i32 -> v2i64.
20572 // Concat upper and lower parts.
20573 //
20574 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20575 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20576
20577 // Short-circuit if we can determine that each 128-bit half is the same value.
20578 // Otherwise, this is difficult to match and optimize.
20579 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20580 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20581 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20582
20583 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20584 SDValue Undef = DAG.getUNDEF(InVT);
20585 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20586 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20587 OpHi = DAG.getBitcast(HalfVT, OpHi);
20588
20589 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20590}
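// An intrinsics-level sketch of the AVX1 v8i16 -> v8i32 zero-extend path above
// (illustrative; assumes SSE4.1/AVX, the function name is made up):
//
//   __m256i zext_v8i16_to_v8i32(__m128i v) {
//     __m128i lo = _mm_cvtepu16_epi32(v);                      // vpmovzxwd (lower 4)
//     __m128i hi = _mm_unpackhi_epi16(v, _mm_setzero_si128()); // vpunpckhwd (upper 4)
//     return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
//   }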
20591
20592// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20593static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20594 const SDLoc &dl, SelectionDAG &DAG) {
20595 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20596 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20597 DAG.getVectorIdxConstant(0, dl));
20598 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20599 DAG.getVectorIdxConstant(8, dl));
20600 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20601 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20602 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20603 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20604}
20605
20606 static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20607 const X86Subtarget &Subtarget,
20608 SelectionDAG &DAG) {
20609 MVT VT = Op->getSimpleValueType(0);
20610 SDValue In = Op->getOperand(0);
20611 MVT InVT = In.getSimpleValueType();
20612 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20613 unsigned NumElts = VT.getVectorNumElements();
20614
20615 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20616 // avoids a constant pool load.
20617 if (VT.getVectorElementType() != MVT::i8) {
20618 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20619 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20620 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20621 }
20622
20623 // Extend VT if BWI is not supported.
20624 MVT ExtVT = VT;
20625 if (!Subtarget.hasBWI()) {
20626 // If v16i32 is to be avoided, we'll need to split and concatenate.
20627 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20628 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20629
20630 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20631 }
20632
20633 // Widen to 512-bits if VLX is not supported.
20634 MVT WideVT = ExtVT;
20635 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20636 NumElts *= 512 / ExtVT.getSizeInBits();
20637 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20638 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
20639 DAG.getVectorIdxConstant(0, DL));
20640 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
20641 }
20642
20643 SDValue One = DAG.getConstant(1, DL, WideVT);
20644 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20645
20646 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20647
20648 // Truncate if we had to extend above.
20649 if (VT != ExtVT) {
20650 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20651 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20652 }
20653
20654 // Extract back to 128/256-bit if we widened.
20655 if (WideVT != VT)
20656 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20657 DAG.getVectorIdxConstant(0, DL));
20658
20659 return SelectedVal;
20660}
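// In IR terms, the non-vXi8 path above is simply (sketch):
//
//   %s = sext <N x i1> %k to <N x iM>   ; all-ones or all-zeros per lane
//   %z = lshr <N x iM> %s, M-1          ; 1 or 0 per lane
//
// while the vXi8 path selects between splat(1) and splat(0), possibly on a
// widened type, instead.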
20661
20662 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20663 SelectionDAG &DAG) {
20664 SDValue In = Op.getOperand(0);
20665 MVT SVT = In.getSimpleValueType();
20666 SDLoc DL(Op);
20667
20668 if (SVT.getVectorElementType() == MVT::i1)
20669 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
20670
20671 assert(Subtarget.hasAVX() && "Expected AVX support");
20672 return LowerAVXExtend(Op, DL, DAG, Subtarget);
20673}
20674
20675/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20676/// It makes use of the fact that vectors with enough leading sign/zero bits
20677/// prevent the PACKSS/PACKUS from saturating the results.
20678/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20679/// within each 128-bit lane.
20680static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20681 const SDLoc &DL, SelectionDAG &DAG,
20682 const X86Subtarget &Subtarget) {
20683 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20684 "Unexpected PACK opcode");
20685 assert(DstVT.isVector() && "VT not a vector?");
20686
20687 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20688 if (!Subtarget.hasSSE2())
20689 return SDValue();
20690
20691 EVT SrcVT = In.getValueType();
20692
20693 // No truncation required, we might get here due to recursive calls.
20694 if (SrcVT == DstVT)
20695 return In;
20696
20697 unsigned NumElems = SrcVT.getVectorNumElements();
20698 if (NumElems < 2 || !isPowerOf2_32(NumElems) )
20699 return SDValue();
20700
20701 unsigned DstSizeInBits = DstVT.getSizeInBits();
20702 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20703 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20704 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20705
20706 LLVMContext &Ctx = *DAG.getContext();
20707 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20708 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20709
20710 // Pack to the largest type possible:
20711 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20712 EVT InVT = MVT::i16, OutVT = MVT::i8;
20713 if (SrcVT.getScalarSizeInBits() > 16 &&
20714 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20715 InVT = MVT::i32;
20716 OutVT = MVT::i16;
20717 }
20718
20719 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20720 // On pre-AVX512, pack the src in both halves to help value tracking.
20721 if (SrcSizeInBits <= 128) {
20722 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20723 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20724 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20725 SDValue LHS = DAG.getBitcast(InVT, In);
20726 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20727 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20728 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20729 Res = DAG.getBitcast(PackedVT, Res);
20730 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20731 }
20732
20733 // Split lower/upper subvectors.
20734 SDValue Lo, Hi;
20735 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20736
20737 // If Hi is undef, then don't bother packing it and widen the result instead.
20738 if (Hi.isUndef()) {
20739 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20740 if (SDValue Res =
20741 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20742 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20743 }
20744
20745 unsigned SubSizeInBits = SrcSizeInBits / 2;
20746 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20747 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20748
20749 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20750 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20751 Lo = DAG.getBitcast(InVT, Lo);
20752 Hi = DAG.getBitcast(InVT, Hi);
20753 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20754 return DAG.getBitcast(DstVT, Res);
20755 }
20756
20757 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20758 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20759 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20760 Lo = DAG.getBitcast(InVT, Lo);
20761 Hi = DAG.getBitcast(InVT, Hi);
20762 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20763
20764 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20765 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20766 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20767 SmallVector<int, 64> Mask;
20768 int Scale = 64 / OutVT.getScalarSizeInBits();
20769 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20770 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20771
20772 if (DstVT.is256BitVector())
20773 return DAG.getBitcast(DstVT, Res);
20774
20775 // If 512bit -> 128bit truncate another stage.
20776 Res = DAG.getBitcast(PackedVT, Res);
20777 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20778 }
20779
20780 // Recursively pack lower/upper subvectors, concat result and pack again.
20781 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20782
20783 if (PackedVT.is128BitVector()) {
20784 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20785 // type legalization.
20786 SDValue Res =
20787 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20788 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20789 }
20790
20791 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20792 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20793 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20794 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20795 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20796}
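// Intrinsics-level sketch of a single PACK stage above, e.g. a v8i32 -> v8i16
// truncate whose 128-bit halves Lo/Hi already sign-fit in 16 bits, so the
// packssdw saturation is a no-op (illustrative):
//
//   __m128i pack_stage(__m128i Lo, __m128i Hi) {
//     return _mm_packs_epi32(Lo, Hi);   // packssdw
//   }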
20797
20798/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20799/// e.g. trunc <8 x i32> X to <8 x i16> -->
20800/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20801/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20802 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20803 const X86Subtarget &Subtarget,
20804 SelectionDAG &DAG) {
20805 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20806 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20807}
20808
20809/// Truncate using inreg sign extension and X86ISD::PACKSS.
20810 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20811 const X86Subtarget &Subtarget,
20812 SelectionDAG &DAG) {
20813 EVT SrcVT = In.getValueType();
20814 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20815 DAG.getValueType(DstVT));
20816 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20817}
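// The two helpers above map onto these intrinsic idioms for a 256-bit
// v8i32 -> v8i16 truncate split into halves Lo/Hi (illustrative, SSE4.1):
//
//   // PACKUS: clear the high bits first so saturation cannot trigger.
//   __m128i m   = _mm_set1_epi32(0xffff);
//   __m128i out = _mm_packus_epi32(_mm_and_si128(Lo, m), _mm_and_si128(Hi, m));
//
// The PACKSS variant instead sign-extends in-register (shl+sra by 16) and
// uses _mm_packs_epi32(Lo, Hi).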
20818
20819/// Helper to determine if \p In truncated to \p DstVT has the necessary
20820/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20821/// possibly by converting a SRL node to SRA for sign extension.
20822static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20823 SDValue In, const SDLoc &DL,
20824 SelectionDAG &DAG,
20825 const X86Subtarget &Subtarget) {
20826 // Requires SSE2.
20827 if (!Subtarget.hasSSE2())
20828 return SDValue();
20829
20830 EVT SrcVT = In.getValueType();
20831 EVT DstSVT = DstVT.getVectorElementType();
20832 EVT SrcSVT = SrcVT.getVectorElementType();
20833 unsigned NumDstEltBits = DstSVT.getSizeInBits();
20834 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
20835
20836 // Check we have a truncation suited for PACKSS/PACKUS.
20837 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20838 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20839 return SDValue();
20840
20841 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20842 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20843
20844 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20845 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20846 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20847 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20848 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20849 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20850 return SDValue();
20851
20852 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20853 // split this for packing.
20854 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20855 !isFreeToSplitVector(In.getNode(), DAG) &&
20856 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20857 return SDValue();
20858
20859 // Don't truncate on AVX512 targets via multiple PACK node stages.
20860 if (Subtarget.hasAVX512() && NumStages > 1)
20861 return SDValue();
20862
20863 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20864 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20865
20866 // Truncate with PACKUS if we are truncating a vector with leading zero
20867 // bits that extend all the way to the packed/truncated value.
20868 // e.g. Masks, zext_in_reg, etc.
20869 // Pre-SSE41 we can only use PACKUSWB.
20870 KnownBits Known = DAG.computeKnownBits(In);
20871 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20872 PackOpcode = X86ISD::PACKUS;
20873 return In;
20874 }
20875
20876 // Truncate with PACKSS if we are truncating a vector with sign-bits
20877 // that extend all the way to the packed/truncated value.
20878 // e.g. Comparison result, sext_in_reg, etc.
20879 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20880
20881 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20882 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20883 // see through BITCASTs later on and combines/simplifications can't then use
20884 // it.
20885 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20886 !Subtarget.hasAVX512())
20887 return SDValue();
20888
20889 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20890 if (MinSignBits < NumSignBits) {
20891 PackOpcode = X86ISD::PACKSS;
20892 return In;
20893 }
20894
20895 // If we have a srl that only generates signbits that we will discard in
20896 // the truncation then we can use PACKSS by converting the srl to a sra.
20897 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
20898 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20899 if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
20900 if (*ShAmt == MinSignBits) {
20901 PackOpcode = X86ISD::PACKSS;
20902 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20903 }
20904 }
20905
20906 return SDValue();
20907}
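// Example of the final srl -> sra rewrite above: for a v4i32 -> v4i16 truncate
// of (srl X, 16), the bits shifted in are discarded by the truncation, so the
// node can be rewritten as (sra X, 16), which then has the sign bits that
// PACKSSDW requires (sketch):
//
//   trunc (srl <4 x i32> X, 16)  ==>  packssdw (sra <4 x i32> X, 16)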
20908
20909/// This function lowers a vector truncation of 'extended sign-bits' or
20910/// 'extended zero-bits' values.
20911/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20912 static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20913 const SDLoc &DL,
20914 const X86Subtarget &Subtarget,
20915 SelectionDAG &DAG) {
20916 MVT SrcVT = In.getSimpleValueType();
20917 MVT DstSVT = DstVT.getVectorElementType();
20918 MVT SrcSVT = SrcVT.getVectorElementType();
20919 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20920 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20921 return SDValue();
20922
20923 // If the upper half of the source is undef, then attempt to split and
20924 // only truncate the lower half.
20925 if (DstVT.getSizeInBits() >= 128) {
20926 SmallVector<SDValue> LowerOps;
20927 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20928 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20929 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20930 Subtarget, DAG))
20931 return widenSubVector(Res, false, Subtarget, DAG, DL,
20932 DstVT.getSizeInBits());
20933 }
20934 }
20935
20936 unsigned PackOpcode;
20937 if (SDValue Src =
20938 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20939 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20940
20941 return SDValue();
20942}
20943
20944/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20945/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20946 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20947 const X86Subtarget &Subtarget,
20948 SelectionDAG &DAG) {
20949 MVT SrcVT = In.getSimpleValueType();
20950 MVT DstSVT = DstVT.getVectorElementType();
20951 MVT SrcSVT = SrcVT.getVectorElementType();
20952 unsigned NumElems = DstVT.getVectorNumElements();
20953 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20954 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20955 NumElems >= 8))
20956 return SDValue();
20957
20958 // SSSE3's pshufb results in fewer instructions in the cases below.
20959 if (Subtarget.hasSSSE3() && NumElems == 8) {
20960 if (SrcSVT == MVT::i16)
20961 return SDValue();
20962 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20963 return SDValue();
20964 }
20965
20966 // If the upper half of the source is undef, then attempt to split and
20967 // only truncate the lower half.
20968 if (DstVT.getSizeInBits() >= 128) {
20969 SmallVector<SDValue> LowerOps;
20970 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20971 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20972 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20973 return widenSubVector(Res, false, Subtarget, DAG, DL,
20974 DstVT.getSizeInBits());
20975 }
20976 }
20977
20978 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20979 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20980 // truncate 2 x v4i32 to v8i16.
20981 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20982 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20983
20984 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20985 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20986
20987 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20988 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20989 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20990 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20991 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20992 }
20993
20994 return SDValue();
20995}
20996
20997 static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
20998 SelectionDAG &DAG,
20999 const X86Subtarget &Subtarget) {
21000 MVT VT = Op.getSimpleValueType();
21001 SDValue In = Op.getOperand(0);
21002 MVT InVT = In.getSimpleValueType();
21003 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21004
21005 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21006 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21007 if (InVT.getScalarSizeInBits() <= 16) {
21008 if (Subtarget.hasBWI()) {
21009 // legal, will go to VPMOVB2M, VPMOVW2M
21010 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21011 // We need to shift to get the lsb into sign position.
21012 // Shift packed bytes not supported natively, bitcast to word
21013 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21014 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21015 DAG.getBitcast(ExtVT, In),
21016 DAG.getConstant(ShiftInx, DL, ExtVT));
21017 In = DAG.getBitcast(InVT, In);
21018 }
21019 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21020 In, ISD::SETGT);
21021 }
21022 // Use TESTD/Q, extended vector to packed dword/qword.
21023 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21024 "Unexpected vector type.");
21025 unsigned NumElts = InVT.getVectorNumElements();
21026 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21027 // We need to change to a wider element type that we have support for.
21028 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21029 // For 16 element vectors we extend to v16i32 unless we are explicitly
21030 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21031 // we need to split into two 8 element vectors which we can extend to v8i32,
21032 // truncate and concat the results. There's an additional complication if
21033 // the original type is v16i8. In that case we can't split the v16i8
21034 // directly, so we need to shuffle high elements to low and use
21035 // sign_extend_vector_inreg.
21036 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21037 SDValue Lo, Hi;
21038 if (InVT == MVT::v16i8) {
21039 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21040 Hi = DAG.getVectorShuffle(
21041 InVT, DL, In, In,
21042 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21043 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21044 } else {
21045 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21046 Lo = extract128BitVector(In, 0, DAG, DL);
21047 Hi = extract128BitVector(In, 8, DAG, DL);
21048 }
21049 // We're split now; just emit two truncates and a concat. The two
21050 // truncates will trigger legalization to come back to this function.
21051 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21052 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21053 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21054 }
21055 // We either have 8 elements or we're allowed to use 512-bit vectors.
21056 // If we have VLX, we want to use the narrowest vector that can get the
21057 // job done so we use vXi32.
21058 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21059 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21060 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21061 InVT = ExtVT;
21062 ShiftInx = InVT.getScalarSizeInBits() - 1;
21063 }
21064
21065 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21066 // We need to shift to get the lsb into sign position.
21067 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21068 DAG.getConstant(ShiftInx, DL, InVT));
21069 }
21070 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21071 if (Subtarget.hasDQI())
21072 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21073 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21074}
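// Sketch of the vXiN -> vXi1 idiom above: move the lsb into the sign position,
// then a signed compare against zero yields the mask register, e.g. for v16i8
// with BWI (illustrative):
//
//   %t = shl <16 x i8> %x, 7                       ; lsb -> msb (done as a word shift)
//   %k = icmp sgt <16 x i8> zeroinitializer, %t    ; selects lanes with the msb set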
21075
21076SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21077 SDLoc DL(Op);
21078 MVT VT = Op.getSimpleValueType();
21079 SDValue In = Op.getOperand(0);
21080 MVT InVT = In.getSimpleValueType();
21082 "Invalid TRUNCATE operation");
21083
21084 // If we're called by the type legalizer, handle a few cases.
21085 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21086 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21087 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21088 VT.is128BitVector() && Subtarget.hasAVX512()) {
21089 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21090 "Unexpected subtarget!");
21091 // The default behavior is to truncate one step, concatenate, and then
21092 // truncate the remainder. We'd rather produce two 64-bit results and
21093 // concatenate those.
21094 SDValue Lo, Hi;
21095 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21096
21097 EVT LoVT, HiVT;
21098 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21099
21100 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21101 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21102 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21103 }
21104
21105 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21106 if (!Subtarget.hasAVX512() ||
21107 (InVT.is512BitVector() && VT.is256BitVector()))
21108 if (SDValue SignPack =
21109 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
21110 return SignPack;
21111
21112 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21113 if (!Subtarget.hasAVX512())
21114 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21115
21116 // Otherwise let default legalization handle it.
21117 return SDValue();
21118 }
21119
21120 if (VT.getVectorElementType() == MVT::i1)
21121 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21122
21123 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21124 // concat from subvectors to use VPTRUNC etc.
21125 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
21126 if (SDValue SignPack =
21127 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
21128 return SignPack;
21129
21130 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21131 if (Subtarget.hasAVX512()) {
21132 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21133 assert(VT == MVT::v32i8 && "Unexpected VT!");
21134 return splitVectorIntUnary(Op, DAG, DL);
21135 }
21136
21137 // Word to byte is only legal under BWI. Otherwise we have to promote to v16i32
21138 // and then truncate that. But we should only do that if we haven't been
21139 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21140 // handled by isel patterns.
21141 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21142 Subtarget.canExtendTo512DQ())
21143 return Op;
21144 }
21145
21146 // Handle truncation of V256 to V128 using shuffles.
21147 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21148
21149 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21150 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21151 if (Subtarget.hasInt256()) {
21152 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21153 In = DAG.getBitcast(MVT::v8i32, In);
21154 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21155 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21156 DAG.getVectorIdxConstant(0, DL));
21157 }
21158
21159 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21160 DAG.getVectorIdxConstant(0, DL));
21161 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21162 DAG.getVectorIdxConstant(2, DL));
21163 static const int ShufMask[] = {0, 2, 4, 6};
21164 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21165 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21166 }
21167
21168 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21169 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21170 if (Subtarget.hasInt256()) {
21171 // The PSHUFB mask:
21172 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21173 -1, -1, -1, -1, -1, -1, -1, -1,
21174 16, 17, 20, 21, 24, 25, 28, 29,
21175 -1, -1, -1, -1, -1, -1, -1, -1 };
21176 In = DAG.getBitcast(MVT::v32i8, In);
21177 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21178 In = DAG.getBitcast(MVT::v4i64, In);
21179
21180 static const int ShufMask2[] = {0, 2, -1, -1};
21181 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21182 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21183 DAG.getVectorIdxConstant(0, DL));
21184 return DAG.getBitcast(MVT::v8i16, In);
21185 }
21186
21187 return Subtarget.hasSSE41()
21188 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21189 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21190 }
21191
21192 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21193 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21194
21195 llvm_unreachable("All 256->128 cases should have been handled above!");
21196}
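// Intrinsics-level sketch of the AVX2 v4i64 -> v4i32 case above (illustrative):
//
//   __m128i trunc_v4i64_to_v4i32(__m256i v) {
//     __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
//     __m256i p   = _mm256_permutevar8x32_epi32(v, idx);   // vpermd
//     return _mm256_castsi256_si128(p);
//   }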
21197
21198// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21199 // behaves on out-of-range inputs to generate optimized conversions.
21200 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21201 SelectionDAG &DAG,
21202 const X86Subtarget &Subtarget) {
21203 MVT SrcVT = Src.getSimpleValueType();
21204 unsigned DstBits = VT.getScalarSizeInBits();
21205 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21206
21207 // Calculate the converted result for values in the range 0 to
21208 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21209 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21210 SDValue Big =
21211 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21212 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21213 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21214
21215 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21216 // and only if the value was out of range. So we can use that
21217 // as our indicator that we rather use "Big" instead of "Small".
21218 //
21219 // Use "Small" if "IsOverflown" has all bits cleared
21220 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21221
21222 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21223 // use the slightly slower blendv select instead.
21224 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21225 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21226 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21227 }
21228
21229 SDValue IsOverflown =
21230 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21231 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21232 return DAG.getNode(ISD::OR, dl, VT, Small,
21233 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21234}
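// Scalar model of the out-of-range trick above for f32 -> u32 (illustrative;
// relies on cvttps2si returning 0x80000000 for out-of-range inputs):
//
//   uint32_t fp_to_uint32(float x) {
//     int32_t small = (int32_t)x;                    // cvttp2si(x)
//     int32_t big   = (int32_t)(x - 2147483648.0f);  // cvttp2si(x - 2^31)
//     int32_t ovf   = small >> 31;                   // all-ones iff small overflowed
//     return (uint32_t)(small | (big & ovf));
//   }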
21235
21236SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21237 bool IsStrict = Op->isStrictFPOpcode();
21238 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21239 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21240 MVT VT = Op->getSimpleValueType(0);
21241 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21242 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21243 MVT SrcVT = Src.getSimpleValueType();
21244 SDLoc dl(Op);
21245
21246 SDValue Res;
21247 if (isSoftF16(SrcVT, Subtarget)) {
21248 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21249 if (IsStrict)
21250 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21251 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21252 {NVT, MVT::Other}, {Chain, Src})});
21253 return DAG.getNode(Op.getOpcode(), dl, VT,
21254 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21255 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
21256 return Op;
21257 }
21258
21259 if (VT.isVector()) {
21260 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21261 MVT ResVT = MVT::v4i32;
21262 MVT TruncVT = MVT::v4i1;
21263 unsigned Opc;
21264 if (IsStrict)
21265 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21266 else
21267 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21268
21269 if (!IsSigned && !Subtarget.hasVLX()) {
21270 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21271 // Widen to 512-bits.
21272 ResVT = MVT::v8i32;
21273 TruncVT = MVT::v8i1;
21274 Opc = Op.getOpcode();
21275 // Need to concat with zero vector for strict fp to avoid spurious
21276 // exceptions.
21277 // TODO: Should we just do this for non-strict as well?
21278 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21279 : DAG.getUNDEF(MVT::v8f64);
21280 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21281 DAG.getVectorIdxConstant(0, dl));
21282 }
21283 if (IsStrict) {
21284 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21285 Chain = Res.getValue(1);
21286 } else {
21287 Res = DAG.getNode(Opc, dl, ResVT, Src);
21288 }
21289
21290 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21291 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21292 DAG.getVectorIdxConstant(0, dl));
21293 if (IsStrict)
21294 return DAG.getMergeValues({Res, Chain}, dl);
21295 return Res;
21296 }
21297
21298 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21299 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
21300 return Op;
21301
21302 MVT ResVT = VT;
21303 MVT EleVT = VT.getVectorElementType();
21304 if (EleVT != MVT::i64)
21305 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21306
21307 if (SrcVT != MVT::v8f16) {
21308 SDValue Tmp =
21309 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21310 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21311 Ops[0] = Src;
21312 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21313 }
21314
21315 if (IsStrict) {
21316 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21317 : X86ISD::STRICT_CVTTP2UI,
21318 dl, {ResVT, MVT::Other}, {Chain, Src});
21319 Chain = Res.getValue(1);
21320 } else {
21321 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21322 ResVT, Src);
21323 }
21324
21325 // TODO: Need to add exception check code for strict FP.
21326 if (EleVT.getSizeInBits() < 16) {
21327 ResVT = MVT::getVectorVT(EleVT, 8);
21328 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21329 }
21330
21331 if (ResVT != VT)
21332 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21333 DAG.getVectorIdxConstant(0, dl));
21334
21335 if (IsStrict)
21336 return DAG.getMergeValues({Res, Chain}, dl);
21337 return Res;
21338 }
21339
21340 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21341 if (VT.getVectorElementType() == MVT::i16) {
21342 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21343 SrcVT.getVectorElementType() == MVT::f64) &&
21344 "Expected f32/f64 vector!");
21345 MVT NVT = VT.changeVectorElementType(MVT::i32);
21346 if (IsStrict) {
21347 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21348 : ISD::STRICT_FP_TO_UINT,
21349 dl, {NVT, MVT::Other}, {Chain, Src});
21350 Chain = Res.getValue(1);
21351 } else {
21352 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21353 NVT, Src);
21354 }
21355
21356 // TODO: Need to add exception check code for strict FP.
21357 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21358
21359 if (IsStrict)
21360 return DAG.getMergeValues({Res, Chain}, dl);
21361 return Res;
21362 }
21363
21364 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21365 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21366 assert(!IsSigned && "Expected unsigned conversion!");
21367 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21368 return Op;
21369 }
21370
21371 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21372 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21373 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21374 Subtarget.useAVX512Regs()) {
21375 assert(!IsSigned && "Expected unsigned conversion!");
21376 assert(!Subtarget.hasVLX() && "Unexpected features!");
21377 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21378 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21379 // Need to concat with zero vector for strict fp to avoid spurious
21380 // exceptions.
21381 // TODO: Should we just do this for non-strict as well?
21382 SDValue Tmp =
21383 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21384 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21385 DAG.getVectorIdxConstant(0, dl));
21386
21387 if (IsStrict) {
21388 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21389 {Chain, Src});
21390 Chain = Res.getValue(1);
21391 } else {
21392 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21393 }
21394
21395 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21396 DAG.getVectorIdxConstant(0, dl));
21397
21398 if (IsStrict)
21399 return DAG.getMergeValues({Res, Chain}, dl);
21400 return Res;
21401 }
21402
21403 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21404 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21405 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21406 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21407 assert(!Subtarget.hasVLX() && "Unexpected features!");
21408 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21409 // Need to concat with zero vector for strict fp to avoid spurious
21410 // exceptions.
21411 // TODO: Should we just do this for non-strict as well?
21412 SDValue Tmp =
21413 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21414 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21415 DAG.getVectorIdxConstant(0, dl));
21416
21417 if (IsStrict) {
21418 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21419 {Chain, Src});
21420 Chain = Res.getValue(1);
21421 } else {
21422 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21423 }
21424
21425 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21426 DAG.getVectorIdxConstant(0, dl));
21427
21428 if (IsStrict)
21429 return DAG.getMergeValues({Res, Chain}, dl);
21430 return Res;
21431 }
21432
21433 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21434 if (!Subtarget.hasVLX()) {
21435 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21436 // legalizer and then widened again by vector op legalization.
21437 if (!IsStrict)
21438 return SDValue();
21439
21440 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21441 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21442 {Src, Zero, Zero, Zero});
21443 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21444 {Chain, Tmp});
21445 SDValue Chain = Tmp.getValue(1);
21446 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21447 DAG.getVectorIdxConstant(0, dl));
21448 return DAG.getMergeValues({Tmp, Chain}, dl);
21449 }
21450
21451 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21452 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21453 DAG.getUNDEF(MVT::v2f32));
21454 if (IsStrict) {
21455 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21456 : X86ISD::STRICT_CVTTP2UI;
21457 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21458 }
21459 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21460 return DAG.getNode(Opc, dl, VT, Tmp);
21461 }
21462
21463 // Generate optimized instructions for pre AVX512 unsigned conversions from
21464 // vXf32 to vXi32.
21465 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21466 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21467 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21468 assert(!IsSigned && "Expected unsigned conversion!");
21469 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21470 }
21471
21472 return SDValue();
21473 }
21474
21475 assert(!VT.isVector());
21476
21477 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21478
21479 if (!IsSigned && UseSSEReg) {
21480 // Conversions from f32/f64 with AVX512 should be legal.
21481 if (Subtarget.hasAVX512())
21482 return Op;
21483
21484 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21485 // behaves on out-of-range inputs to generate optimized conversions.
21486 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21487 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21488 unsigned DstBits = VT.getScalarSizeInBits();
21489 APInt UIntLimit = APInt::getSignMask(DstBits);
21490 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21491 DAG.getConstant(UIntLimit, dl, VT));
21492 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21493
21494 // Calculate the converted result for values in the range:
21495 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21496 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21497 SDValue Small =
21498 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21499 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21500 SDValue Big = DAG.getNode(
21501 X86ISD::CVTTS2SI, dl, VT,
21502 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21503 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21504
21505 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21506 // and only if the value was out of range. So we can use that
21507 // as our indicator that we rather use "Big" instead of "Small".
21508 //
21509 // Use "Small" if "IsOverflown" has all bits cleared
21510 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21511 SDValue IsOverflown = DAG.getNode(
21512 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21513 return DAG.getNode(ISD::OR, dl, VT, Small,
21514 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21515 }
21516
21517 // Use default expansion for i64.
21518 if (VT == MVT::i64)
21519 return SDValue();
21520
21521 assert(VT == MVT::i32 && "Unexpected VT!");
21522
21523 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21524 // FIXME: This does not generate an invalid exception if the input does not
21525 // fit in i32. PR44019
21526 if (Subtarget.is64Bit()) {
21527 if (IsStrict) {
21528 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21529 {Chain, Src});
21530 Chain = Res.getValue(1);
21531 } else
21532 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21533
21534 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21535 if (IsStrict)
21536 return DAG.getMergeValues({Res, Chain}, dl);
21537 return Res;
21538 }
21539
21540 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21541 // use fisttp which will be handled later.
21542 if (!Subtarget.hasSSE3())
21543 return SDValue();
21544 }
21545
21546 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21547 // FIXME: This does not generate an invalid exception if the input does not
21548 // fit in i16. PR44019
21549 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21550 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21551 if (IsStrict) {
21552 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21553 {Chain, Src});
21554 Chain = Res.getValue(1);
21555 } else
21556 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21557
21558 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21559 if (IsStrict)
21560 return DAG.getMergeValues({Res, Chain}, dl);
21561 return Res;
21562 }
21563
21564 // If this is a FP_TO_SINT using SSEReg we're done.
21565 if (UseSSEReg && IsSigned)
21566 return Op;
21567
21568 // fp128 needs to use a libcall.
21569 if (SrcVT == MVT::f128) {
21570 RTLIB::Libcall LC;
21571 if (IsSigned)
21572 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21573 else
21574 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21575
21576 MakeLibCallOptions CallOptions;
21577 std::pair<SDValue, SDValue> Tmp =
21578 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21579
21580 if (IsStrict)
21581 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21582
21583 return Tmp.first;
21584 }
21585
21586 // Fall back to X87.
21587 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21588 if (IsStrict)
21589 return DAG.getMergeValues({V, Chain}, dl);
21590 return V;
21591 }
21592
21593 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21594}
21595
21596SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21597 SelectionDAG &DAG) const {
21598 SDValue Src = Op.getOperand(0);
21599 EVT DstVT = Op.getSimpleValueType();
21600 MVT SrcVT = Src.getSimpleValueType();
21601
21602 if (SrcVT.isVector())
21603 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21604
21605 if (SrcVT == MVT::f16)
21606 return SDValue();
21607
21608 // If the source is in an SSE register, the node is Legal.
21609 if (isScalarFPTypeInSSEReg(SrcVT))
21610 return Op;
21611
21612 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21613}
21614
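/// Shared helper for LRINT/LLRINT lowering on targets without a native
/// conversion: spill the operand to a stack slot (reloading it through FLD
/// when it currently lives in an SSE register), let FIST store the rounded
/// integer using the current rounding mode, and load that integer back.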
21615SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21616 SelectionDAG &DAG) const {
21617 EVT DstVT = N->getValueType(0);
21618 SDValue Src = N->getOperand(0);
21619 EVT SrcVT = Src.getValueType();
21620
21621 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21622 // f16 must be promoted before using the lowering in this routine.
21623 // fp128 does not use this lowering.
21624 return SDValue();
21625 }
21626
21627 SDLoc DL(N);
21628 SDValue Chain = DAG.getEntryNode();
21629
21630 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21631
21632 // If we're converting from SSE, the stack slot needs to hold both types.
21633 // Otherwise it only needs to hold the DstVT.
21634 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21635 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21636 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21637 MachinePointerInfo MPI =
21638       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21639
21640 if (UseSSE) {
21641 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21642 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21643 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21644 SDValue Ops[] = { Chain, StackPtr };
21645
21646 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21647 /*Align*/ std::nullopt,
21648                                  MachineMemOperand::MOLoad);
21649     Chain = Src.getValue(1);
21650 }
21651
21652 SDValue StoreOps[] = { Chain, Src, StackPtr };
21653 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21654 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21655                                  MachineMemOperand::MOStore);
21656
21657 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21658}
21659
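/// Custom lowering for FP_TO_SINT_SAT / FP_TO_UINT_SAT. When the saturation
/// bounds are exactly representable in the source FP type, clamp with
/// FMIN/FMAX and convert once; otherwise convert first and patch the
/// out-of-range cases (and NaN, which saturates to zero) with selects.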
21660SDValue
21661X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21662 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21663 // but making use of X86 specifics to produce better instruction sequences.
21664 SDNode *Node = Op.getNode();
21665 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21666 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21667 SDLoc dl(SDValue(Node, 0));
21668 SDValue Src = Node->getOperand(0);
21669
21670 // There are three types involved here: SrcVT is the source floating point
21671 // type, DstVT is the type of the result, and TmpVT is the result of the
21672 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21673 // DstVT).
21674 EVT SrcVT = Src.getValueType();
21675 EVT DstVT = Node->getValueType(0);
21676 EVT TmpVT = DstVT;
21677
21678 // This code is only for floats and doubles. Fall back to generic code for
21679 // anything else.
21680 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
21681 return SDValue();
21682
21683 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21684 unsigned SatWidth = SatVT.getScalarSizeInBits();
21685 unsigned DstWidth = DstVT.getScalarSizeInBits();
21686 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21687 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21688 "Expected saturation width smaller than result width");
21689
21690 // Promote result of FP_TO_*INT to at least 32 bits.
21691 if (TmpWidth < 32) {
21692 TmpVT = MVT::i32;
21693 TmpWidth = 32;
21694 }
21695
21696 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21697 // us to use a native signed conversion instead.
21698 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21699 TmpVT = MVT::i64;
21700 TmpWidth = 64;
21701 }
21702
21703 // If the saturation width is smaller than the size of the temporary result,
21704 // we can always use signed conversion, which is native.
21705 if (SatWidth < TmpWidth)
21706 FpToIntOpcode = ISD::FP_TO_SINT;
21707
21708 // Determine minimum and maximum integer values and their corresponding
21709 // floating-point values.
21710 APInt MinInt, MaxInt;
21711 if (IsSigned) {
21712 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21713 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21714 } else {
21715 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21716 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21717 }
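  // For example, an i8 saturation width with a 32-bit result gives
  // MinInt = 0xFFFFFF80 (-128) and MaxInt = 0x0000007F (127) in the signed
  // case; both convert exactly to -128.0 and 127.0, so the clamp-based path
  // below is taken.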
21718
21719 const fltSemantics &Sem = SrcVT.getFltSemantics();
21720 APFloat MinFloat(Sem);
21721 APFloat MaxFloat(Sem);
21722
21723 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21724 MinInt, IsSigned, APFloat::rmTowardZero);
21725 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21726 MaxInt, IsSigned, APFloat::rmTowardZero);
21727 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21728 && !(MaxStatus & APFloat::opStatus::opInexact);
21729
21730 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21731 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21732
21733 // If the integer bounds are exactly representable as floats, emit a
21734 // min+max+fptoi sequence. Otherwise use comparisons and selects.
21735 if (AreExactFloatBounds) {
21736 if (DstVT != TmpVT) {
21737 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21738 SDValue MinClamped = DAG.getNode(
21739 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21740 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21741 SDValue BothClamped = DAG.getNode(
21742 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21743 // Convert clamped value to integer.
21744 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21745
21746 // NaN will become INDVAL, with the top bit set and the rest zero.
21747 // Truncation will discard the top bit, resulting in zero.
21748 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21749 }
21750
21751 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21752 SDValue MinClamped = DAG.getNode(
21753 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21754 // Clamp by MaxFloat from above. NaN cannot occur.
21755 SDValue BothClamped = DAG.getNode(
21756 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21757 // Convert clamped value to integer.
21758 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21759
21760 if (!IsSigned) {
21761 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21762 // which is zero.
21763 return FpToInt;
21764 }
21765
21766 // Otherwise, select zero if Src is NaN.
21767 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21768 return DAG.getSelectCC(
21769 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21770 }
21771
21772 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21773 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21774
21775 // Result of direct conversion, which may be selected away.
21776 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21777
21778 if (DstVT != TmpVT) {
21779 // NaN will become INDVAL, with the top bit set and the rest zero.
21780 // Truncation will discard the top bit, resulting in zero.
21781 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21782 }
21783
21784 SDValue Select = FpToInt;
21785 // For signed conversions where we saturate to the same size as the
21786 // result type of the fptoi instructions, INDVAL coincides with integer
21787 // minimum, so we don't need to explicitly check it.
21788 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21789 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21790 // MinInt if Src is NaN.
21791 Select = DAG.getSelectCC(
21792 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21793 }
21794
21795 // If Src OGT MaxFloat, select MaxInt.
21796 Select = DAG.getSelectCC(
21797 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21798
21799 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21800 // is already zero. The promoted case was already handled above.
21801 if (!IsSigned || DstVT != TmpVT) {
21802 return Select;
21803 }
21804
21805 // Otherwise, select 0 if Src is NaN.
21806 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21807 return DAG.getSelectCC(
21808 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21809}
21810
21811SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21812 bool IsStrict = Op->isStrictFPOpcode();
21813
21814 SDLoc DL(Op);
21815 MVT VT = Op.getSimpleValueType();
21816 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21817 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21818 MVT SVT = In.getSimpleValueType();
21819
21820 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21821 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
21822 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21823 !Subtarget.getTargetTriple().isOSDarwin()))
21824 return SDValue();
21825
21826 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21827 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21828 return Op;
21829
21830 if (SVT == MVT::f16) {
21831 if (Subtarget.hasFP16())
21832 return Op;
21833
21834 if (VT != MVT::f32) {
21835 if (IsStrict)
21836 return DAG.getNode(
21837 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21838 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21839 {MVT::f32, MVT::Other}, {Chain, In})});
21840
21841 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21842 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21843 }
21844
21845 if (!Subtarget.hasF16C()) {
21846 if (!Subtarget.getTargetTriple().isOSDarwin())
21847 return SDValue();
21848
21849 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21850
21851 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21852       TargetLowering::CallLoweringInfo CLI(DAG);
21853       Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21854
21855 In = DAG.getBitcast(MVT::i16, In);
21856       TargetLowering::ArgListTy Args;
21857       TargetLowering::ArgListEntry Entry;
21858       Entry.Node = In;
21859 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21860 Entry.IsSExt = false;
21861 Entry.IsZExt = true;
21862 Args.push_back(Entry);
21863
21864       SDValue Callee = DAG.getExternalSymbol(
21865           getLibcallName(RTLIB::FPEXT_F16_F32),
21866           getPointerTy(DAG.getDataLayout()));
21867       CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21868 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21869 std::move(Args));
21870
21871 SDValue Res;
21872 std::tie(Res,Chain) = LowerCallTo(CLI);
21873 if (IsStrict)
21874 Res = DAG.getMergeValues({Res, Chain}, DL);
21875
21876 return Res;
21877 }
21878
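    // F16C path for scalar f16 -> f32: place the half in lane 0 of a zeroed
    // v8i16 vector, convert it with (V)CVTPH2PS, and extract lane 0 of the
    // v4f32 result.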
21879 In = DAG.getBitcast(MVT::i16, In);
21880 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21881 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21882 DAG.getVectorIdxConstant(0, DL));
21883 SDValue Res;
21884 if (IsStrict) {
21885 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21886 {Chain, In});
21887 Chain = Res.getValue(1);
21888 } else {
21889 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21890 DAG.getTargetConstant(4, DL, MVT::i32));
21891 }
21892 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21893 DAG.getVectorIdxConstant(0, DL));
21894 if (IsStrict)
21895 return DAG.getMergeValues({Res, Chain}, DL);
21896 return Res;
21897 }
21898
21899 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
21900 return Op;
21901
21902 if (SVT.getVectorElementType() == MVT::f16) {
21903 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21904 return Op;
21905 assert(Subtarget.hasF16C() && "Unexpected features!");
21906 if (SVT == MVT::v2f16)
21907 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21908 DAG.getUNDEF(MVT::v2f16));
21909 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21910 DAG.getUNDEF(MVT::v4f16));
21911 if (IsStrict)
21912 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21913 {Op->getOperand(0), Res});
21914 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21915 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21916 return Op;
21917 }
21918
21919 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21920
21921 SDValue Res =
21922 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21923 if (IsStrict)
21924 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21925 {Op->getOperand(0), Res});
21926 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21927}
21928
21929SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21930 bool IsStrict = Op->isStrictFPOpcode();
21931
21932 SDLoc DL(Op);
21933 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21934 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21935 MVT VT = Op.getSimpleValueType();
21936 MVT SVT = In.getSimpleValueType();
21937
21938 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21939 return SDValue();
21940
21941 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21942 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21943 if (!Subtarget.getTargetTriple().isOSDarwin())
21944 return SDValue();
21945
21946 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
21947     TargetLowering::CallLoweringInfo CLI(DAG);
21948     Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21949
21950     TargetLowering::ArgListTy Args;
21951     TargetLowering::ArgListEntry Entry;
21952     Entry.Node = In;
21953 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21954 Entry.IsSExt = false;
21955 Entry.IsZExt = true;
21956 Args.push_back(Entry);
21957
21958     SDValue Callee = DAG.getExternalSymbol(
21959         getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21960 : RTLIB::FPROUND_F32_F16),
21961         getPointerTy(DAG.getDataLayout()));
21962     CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21963 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21964 std::move(Args));
21965
21966 SDValue Res;
21967 std::tie(Res, Chain) = LowerCallTo(CLI);
21968
21969 Res = DAG.getBitcast(MVT::f16, Res);
21970
21971 if (IsStrict)
21972 Res = DAG.getMergeValues({Res, Chain}, DL);
21973
21974 return Res;
21975 }
21976
21977 if (VT.getScalarType() == MVT::bf16) {
21978 if (SVT.getScalarType() == MVT::f32 &&
21979 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21980 Subtarget.hasAVXNECONVERT()))
21981 return Op;
21982 return SDValue();
21983 }
21984
21985 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21986 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21987 return SDValue();
21988
21989 if (VT.isVector())
21990 return Op;
21991
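    // F16C path for scalar f32 -> f16: insert the float into lane 0 of a
    // v4f32 vector, convert it with (V)CVTPS2PH using the current rounding
    // mode, and extract the resulting half from lane 0.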
21992 SDValue Res;
21993     SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21994                                         MVT::i32);
21995 if (IsStrict) {
21996 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21997 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21998 DAG.getVectorIdxConstant(0, DL));
21999 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22000 {Chain, Res, Rnd});
22001 Chain = Res.getValue(1);
22002 } else {
22003 // FIXME: Should we use zeros for upper elements for non-strict?
22004 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22005 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22006 }
22007
22008 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22009 DAG.getVectorIdxConstant(0, DL));
22010 Res = DAG.getBitcast(MVT::f16, Res);
22011
22012 if (IsStrict)
22013 return DAG.getMergeValues({Res, Chain}, DL);
22014
22015 return Res;
22016 }
22017
22018 return Op;
22019}
22020
22021 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22022   bool IsStrict = Op->isStrictFPOpcode();
22023 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22024 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22025 "Unexpected VT!");
22026
22027 SDLoc dl(Op);
22028 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22029 DAG.getConstant(0, dl, MVT::v8i16), Src,
22030 DAG.getVectorIdxConstant(0, dl));
22031
22032 SDValue Chain;
22033 if (IsStrict) {
22034 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22035 {Op.getOperand(0), Res});
22036 Chain = Res.getValue(1);
22037 } else {
22038 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22039 }
22040
22041 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22042 DAG.getVectorIdxConstant(0, dl));
22043
22044 if (IsStrict)
22045 return DAG.getMergeValues({Res, Chain}, dl);
22046
22047 return Res;
22048}
22049
22050 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22051   bool IsStrict = Op->isStrictFPOpcode();
22052 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22053 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22054 "Unexpected VT!");
22055
22056 SDLoc dl(Op);
22057 SDValue Res, Chain;
22058 if (IsStrict) {
22059 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22060 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22061 DAG.getVectorIdxConstant(0, dl));
22062 Res = DAG.getNode(
22063 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22064 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22065 Chain = Res.getValue(1);
22066 } else {
22067 // FIXME: Should we use zeros for upper elements for non-strict?
22068 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22069 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22070 DAG.getTargetConstant(4, dl, MVT::i32));
22071 }
22072
22073 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22074 DAG.getVectorIdxConstant(0, dl));
22075
22076 if (IsStrict)
22077 return DAG.getMergeValues({Res, Chain}, dl);
22078
22079 return Res;
22080}
22081
22082SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22083 SelectionDAG &DAG) const {
22084 SDLoc DL(Op);
22085
22086 MVT SVT = Op.getOperand(0).getSimpleValueType();
22087 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22088 Subtarget.hasAVXNECONVERT())) {
22089 SDValue Res;
22090 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22091 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22092 Res = DAG.getBitcast(MVT::v8i16, Res);
22093 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22094 DAG.getVectorIdxConstant(0, DL));
22095 }
22096
22097 MakeLibCallOptions CallOptions;
22098 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22099 SDValue Res =
22100 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22101 return DAG.getBitcast(MVT::i16, Res);
22102}
22103
22104/// Depending on uarch and/or optimizing for size, we might prefer to use a
22105/// vector operation in place of the typical scalar operation.
22106 static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22107                                          SelectionDAG &DAG,
22108 const X86Subtarget &Subtarget) {
22109 // If both operands have other uses, this is probably not profitable.
22110 SDValue LHS = Op.getOperand(0);
22111 SDValue RHS = Op.getOperand(1);
22112 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22113 return Op;
22114
22115 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22116 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22117 if (IsFP && !Subtarget.hasSSE3())
22118 return Op;
22119 if (!IsFP && !Subtarget.hasSSSE3())
22120 return Op;
22121
22122 // Extract from a common vector.
22123 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22124 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22125 LHS.getOperand(0) != RHS.getOperand(0) ||
22126 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22127 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22128 !shouldUseHorizontalOp(true, DAG, Subtarget))
22129 return Op;
22130
22131 // Allow commuted 'hadd' ops.
22132 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22133 unsigned HOpcode;
22134 switch (Op.getOpcode()) {
22135 // clang-format off
22136 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22137 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22138 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22139 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22140 default:
22141 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22142 // clang-format on
22143 }
22144 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22145 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22146 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22147 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22148 std::swap(LExtIndex, RExtIndex);
22149
22150 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22151 return Op;
22152
22153 SDValue X = LHS.getOperand(0);
22154 EVT VecVT = X.getValueType();
22155 unsigned BitWidth = VecVT.getSizeInBits();
22156 unsigned NumLanes = BitWidth / 128;
22157 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22158 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22159 "Not expecting illegal vector widths here");
22160
22161 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22162 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22163 if (BitWidth == 256 || BitWidth == 512) {
22164 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22165 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22166 LExtIndex %= NumEltsPerLane;
22167 }
22168
22169 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22170 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22171 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22172 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22173 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22174 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22175 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22176}
22177
22178/// Depending on uarch and/or optimizing for size, we might prefer to use a
22179/// vector operation in place of the typical scalar operation.
22180SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22181 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22182 "Only expecting float/double");
22183 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22184}
22185
22186/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22187/// This mode isn't supported in hardware on X86. But as long as we aren't
22188/// compiling with trapping math, we can emulate this with
22189/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22190 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22191   SDValue N0 = Op.getOperand(0);
22192 SDLoc dl(Op);
22193 MVT VT = Op.getSimpleValueType();
22194
22195 // N0 += copysign(nextafter(0.5, 0.0), N0)
22196 const fltSemantics &Sem = VT.getFltSemantics();
22197 bool Ignored;
22198 APFloat Point5Pred = APFloat(0.5f);
22199 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22200 Point5Pred.next(/*nextDown*/true);
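  // For f32 this constant is 0x3EFFFFFF (~0.49999997). Adding it to 2.5
  // rounds the FADD result up to 3.0, so the tie rounds away from zero as
  // FROUND requires, while 2.4 stays below 3.0 and truncates to 2.0. Using
  // the value just below 0.5 keeps inputs slightly below 0.5 from being
  // pushed up to 1.0.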
22201
22202 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22203 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22204 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22205
22206 // Truncate the result to remove fraction.
22207 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22208}
22209
22210/// The only differences between FABS and FNEG are the mask and the logic op.
22211/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22212 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22213   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22214 "Wrong opcode for lowering FABS or FNEG.");
22215
22216 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22217
22218 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22219 // into an FNABS. We'll lower the FABS after that if it is still in use.
22220 if (IsFABS)
22221 for (SDNode *User : Op->users())
22222 if (User->getOpcode() == ISD::FNEG)
22223 return Op;
22224
22225 SDLoc dl(Op);
22226 MVT VT = Op.getSimpleValueType();
22227
22228 bool IsF128 = (VT == MVT::f128);
22229 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22230          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22231          "Unexpected type in LowerFABSorFNEG");
22232
22233 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22234 // decide if we should generate a 16-byte constant mask when we only need 4 or
22235 // 8 bytes for the scalar case.
22236
22237 // There are no scalar bitwise logical SSE/AVX instructions, so we
22238 // generate a 16-byte vector constant and logic op even for the scalar case.
22239 // Using a 16-byte mask allows folding the load of the mask with
22240 // the logic op, so it can save (~4 bytes) on code size.
22241 bool IsFakeVector = !VT.isVector() && !IsF128;
22242 MVT LogicVT = VT;
22243 if (IsFakeVector)
22244 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22245 : (VT == MVT::f32) ? MVT::v4f32
22246 : MVT::v8f16;
22247
22248 unsigned EltBits = VT.getScalarSizeInBits();
22249 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22250 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22251 APInt::getSignMask(EltBits);
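  // e.g. for f32: FABS ANDs with 0x7FFFFFFF to clear the sign bit, FNEG XORs
  // with 0x80000000 to flip it, and FNABS (FNEG of FABS) ORs the sign bit in.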
22252 const fltSemantics &Sem = VT.getFltSemantics();
22253 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22254
22255 SDValue Op0 = Op.getOperand(0);
22256 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22257 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22258 IsFNABS ? X86ISD::FOR :
22259                                         X86ISD::FXOR;
22260   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22261
22262 if (VT.isVector() || IsF128)
22263 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22264
22265 // For the scalar case extend to a 128-bit vector, perform the logic op,
22266 // and extract the scalar result back out.
22267 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22268 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22269 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22270 DAG.getVectorIdxConstant(0, dl));
22271}
22272
22273 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22274   SDValue Mag = Op.getOperand(0);
22275 SDValue Sign = Op.getOperand(1);
22276 SDLoc dl(Op);
22277
22278 // If the sign operand is smaller, extend it first.
22279 MVT VT = Op.getSimpleValueType();
22280 if (Sign.getSimpleValueType().bitsLT(VT))
22281 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22282
22283 // And if it is bigger, shrink it first.
22284 if (Sign.getSimpleValueType().bitsGT(VT))
22285 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22286 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22287
22288 // At this point the operands and the result should have the same
22289 // type, and that won't be f80 since that is not custom lowered.
22290 bool IsF128 = (VT == MVT::f128);
22291 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22292          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22293          "Unexpected type in LowerFCOPYSIGN");
22294
22295 const fltSemantics &Sem = VT.getFltSemantics();
22296
22297 // Perform all scalar logic operations as 16-byte vectors because there are no
22298 // scalar FP logic instructions in SSE.
22299 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22300 // unnecessary splats, but we might miss load folding opportunities. Should
22301 // this decision be based on OptimizeForSize?
22302 bool IsFakeVector = !VT.isVector() && !IsF128;
22303 MVT LogicVT = VT;
22304 if (IsFakeVector)
22305 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22306 : (VT == MVT::f32) ? MVT::v4f32
22307 : MVT::v8f16;
22308
22309 // The mask constants are automatically splatted for vector types.
22310 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22311 SDValue SignMask = DAG.getConstantFP(
22312 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22313 SDValue MagMask = DAG.getConstantFP(
22314 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
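  // copysign is then (Mag & MagMask) | (Sign & SignMask). For f32,
  // copysign(-2.0f, 1.0f): 0xC0000000 & 0x7FFFFFFF = 0x40000000 and
  // 0x3F800000 & 0x80000000 = 0, so the OR yields 0x40000000 == 2.0f.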
22315
22316 // First, clear all bits but the sign bit from the second operand (sign).
22317 if (IsFakeVector)
22318 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22319 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22320
22321 // Next, clear the sign bit from the first operand (magnitude).
22322 // TODO: If we had general constant folding for FP logic ops, this check
22323 // wouldn't be necessary.
22324 SDValue MagBits;
22325 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22326 APFloat APF = Op0CN->getValueAPF();
22327 APF.clearSign();
22328 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22329 } else {
22330 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22331 if (IsFakeVector)
22332 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22333 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22334 }
22335
22336 // OR the magnitude value with the sign bit.
22337 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22338 return !IsFakeVector ? Or
22339 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22340 DAG.getVectorIdxConstant(0, dl));
22341}
22342
22343 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22344   SDValue N0 = Op.getOperand(0);
22345 SDLoc dl(Op);
22346 MVT VT = Op.getSimpleValueType();
22347
22348 MVT OpVT = N0.getSimpleValueType();
22349 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22350 "Unexpected type for FGETSIGN");
22351
22352 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
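  // MOVMSK gathers the sign bit of every vector lane into the low bits of a
  // GPR; masking with 1 keeps just the sign of lane 0, which holds N0.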
22353 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22354 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22355 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22356 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22357 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22358 return Res;
22359}
22360
22361 /// Helper for attempting to create an X86ISD::BT node.
22362static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22363 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22364 // instruction. Since the shift amount is in-range-or-undefined, we know
22365 // that doing a bittest on the i32 value is ok. We extend to i32 because
22366 // the encoding for the i16 version is larger than the i32 version.
22367 // Also promote i16 to i32 for performance / code size reason.
22368 if (Src.getValueType().getScalarSizeInBits() < 32)
22369 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22370
22371 // No legal type found, give up.
22372 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22373 return SDValue();
22374
22375 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22376 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22377 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22378 // known to be zero.
22379 if (Src.getValueType() == MVT::i64 &&
22380 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22381 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22382
22383 // If the operand types disagree, extend the shift amount to match. Since
22384 // BT ignores high bits (like shifts) we can use anyextend.
22385 if (Src.getValueType() != BitNo.getValueType()) {
22386 // Peek through a mask/modulo operation.
22387 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22388 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22389 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22390 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22391 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22392 BitNo.getOperand(0)),
22393 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22394 BitNo.getOperand(1)));
22395 else
22396 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22397 }
22398
22399 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22400}
22401
22402 /// Helper for creating an X86ISD::SETCC node.
22403 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22404                         SelectionDAG &DAG) {
22405 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22406 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22407}
22408
22409/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22410/// recognizable memcmp expansion.
22411static bool isOrXorXorTree(SDValue X, bool Root = true) {
22412 if (X.getOpcode() == ISD::OR)
22413 return isOrXorXorTree(X.getOperand(0), false) &&
22414 isOrXorXorTree(X.getOperand(1), false);
22415 if (Root)
22416 return false;
22417 return X.getOpcode() == ISD::XOR;
22418}
22419
22420/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22421/// expansion.
22422template <typename F>
22423 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22424                                 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22425 SDValue Op0 = X.getOperand(0);
22426 SDValue Op1 = X.getOperand(1);
22427 if (X.getOpcode() == ISD::OR) {
22428 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22429 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22430 if (VecVT != CmpVT)
22431 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22432 if (HasPT)
22433 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22434 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22435 }
22436 if (X.getOpcode() == ISD::XOR) {
22437 SDValue A = SToV(Op0);
22438 SDValue B = SToV(Op1);
22439 if (VecVT != CmpVT)
22440 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22441 if (HasPT)
22442 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22443 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22444 }
22445 llvm_unreachable("Impossible");
22446}
22447
22448/// Try to map a 128-bit or larger integer comparison to vector instructions
22449/// before type legalization splits it up into chunks.
22450 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22451                                                ISD::CondCode CC,
22452                                                const SDLoc &DL,
22453 SelectionDAG &DAG,
22454 const X86Subtarget &Subtarget) {
22455 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22456
22457 // We're looking for an oversized integer equality comparison.
22458 EVT OpVT = X.getValueType();
22459 unsigned OpSize = OpVT.getSizeInBits();
22460 if (!OpVT.isScalarInteger() || OpSize < 128)
22461 return SDValue();
22462
22463 // Ignore a comparison with zero because that gets special treatment in
22464 // EmitTest(). But make an exception for the special case of a pair of
22465 // logically-combined vector-sized operands compared to zero. This pattern may
22466 // be generated by the memcmp expansion pass with oversized integer compares
22467 // (see PR33325).
22468 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22469 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22470 return SDValue();
22471
22472 // Don't perform this combine if constructing the vector will be expensive.
22473 auto IsVectorBitCastCheap = [](SDValue X) {
22474     X = peekThroughBitcasts(X);
22475     return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22476 X.getOpcode() == ISD::LOAD;
22477 };
22478 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22479 !IsOrXorXorTreeCCZero)
22480 return SDValue();
22481
22482 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22483 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22484 // Otherwise use PCMPEQ (plus AND) and mask testing.
22485 bool NoImplicitFloatOps =
22486       DAG.getMachineFunction().getFunction().hasFnAttribute(
22487           Attribute::NoImplicitFloat);
22488 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22489 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22490 (OpSize == 256 && Subtarget.hasAVX()) ||
22491 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22492 bool HasPT = Subtarget.hasSSE41();
22493
22494 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22495 // vector registers are essentially free. (Technically, widening registers
22496 // prevents load folding, but the tradeoff is worth it.)
22497 bool PreferKOT = Subtarget.preferMaskRegisters();
22498 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22499
22500 EVT VecVT = MVT::v16i8;
22501 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22502 if (OpSize == 256) {
22503 VecVT = MVT::v32i8;
22504 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22505 }
22506 EVT CastVT = VecVT;
22507 bool NeedsAVX512FCast = false;
22508 if (OpSize == 512 || NeedZExt) {
22509 if (Subtarget.hasBWI()) {
22510 VecVT = MVT::v64i8;
22511 CmpVT = MVT::v64i1;
22512 if (OpSize == 512)
22513 CastVT = VecVT;
22514 } else {
22515 VecVT = MVT::v16i32;
22516 CmpVT = MVT::v16i1;
22517 CastVT = OpSize == 512 ? VecVT
22518 : OpSize == 256 ? MVT::v8i32
22519 : MVT::v4i32;
22520 NeedsAVX512FCast = true;
22521 }
22522 }
22523
22524 auto ScalarToVector = [&](SDValue X) -> SDValue {
22525 bool TmpZext = false;
22526 EVT TmpCastVT = CastVT;
22527 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22528 SDValue OrigX = X.getOperand(0);
22529 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22530 if (OrigSize < OpSize) {
22531 if (OrigSize == 128) {
22532 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22533 X = OrigX;
22534 TmpZext = true;
22535 } else if (OrigSize == 256) {
22536 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22537 X = OrigX;
22538 TmpZext = true;
22539 }
22540 }
22541 }
22542 X = DAG.getBitcast(TmpCastVT, X);
22543 if (!NeedZExt && !TmpZext)
22544 return X;
22545 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22546 DAG.getConstant(0, DL, VecVT), X,
22547 DAG.getVectorIdxConstant(0, DL));
22548 };
22549
22550 SDValue Cmp;
22551 if (IsOrXorXorTreeCCZero) {
22552 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22553 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22554 // Use 2 vector equality compares and 'and' the results before doing a
22555 // MOVMSK.
22556 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22557 } else {
22558 SDValue VecX = ScalarToVector(X);
22559 SDValue VecY = ScalarToVector(Y);
22560 if (VecVT != CmpVT) {
22561 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22562 } else if (HasPT) {
22563 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22564 } else {
22565 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22566 }
22567 }
22568 // AVX512 should emit a setcc that will lower to kortest.
22569 if (VecVT != CmpVT) {
22570 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22571 : CmpVT == MVT::v32i1 ? MVT::i32
22572 : MVT::i16;
22573 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22574 DAG.getConstant(0, DL, KRegVT), CC);
22575 }
22576 if (HasPT) {
22577 SDValue BCCmp =
22578 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22579 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22580       X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22581       SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22582 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22583 }
22584 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22585 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22586 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
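    // PCMPEQB writes 0xFF into each byte lane that compares equal and
    // PMOVMSKB packs those byte sign bits into a 16-bit mask, so the mask is
    // 0xFFFF exactly when all 16 bytes match.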
22587 assert(Cmp.getValueType() == MVT::v16i8 &&
22588 "Non 128-bit vector on pre-SSE41 target");
22589 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22590 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22591 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22592 }
22593
22594 return SDValue();
22595}
22596
22597/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22598/// style scalarized (associative) reduction patterns. Partial reductions
22599/// are supported when the pointer SrcMask is non-null.
22600/// TODO - move this to SelectionDAG?
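/// For example, add(add(extractelt(X,0), extractelt(X,1)),
///                 add(extractelt(X,2), extractelt(X,3)))
/// is matched with SrcOps = {X} once every element of X has been used exactly
/// once (or, with SrcMask, a partial per-element mask is recorded instead).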
22601 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22602                                  SmallVectorImpl<SDValue> &SrcOps,
22603                                  SmallVectorImpl<APInt> *SrcMask = nullptr) {
22604   SmallVector<SDValue, 8> Opnds;
22605   DenseMap<SDValue, APInt> SrcOpMap;
22606 EVT VT = MVT::Other;
22607
22608   // Recognize a special case where a vector is cast into a wide integer to
22609 // test all 0s.
22610 assert(Op.getOpcode() == unsigned(BinOp) &&
22611 "Unexpected bit reduction opcode");
22612 Opnds.push_back(Op.getOperand(0));
22613 Opnds.push_back(Op.getOperand(1));
22614
22615 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22616     SDValue I = Opnds[Slot];
22617     // BFS traverse all BinOp operands.
22618 if (I->getOpcode() == unsigned(BinOp)) {
22619 Opnds.push_back(I->getOperand(0));
22620 Opnds.push_back(I->getOperand(1));
22621 // Re-evaluate the number of nodes to be traversed.
22622 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22623 continue;
22624 }
22625
22626     // Quit if this is not an EXTRACT_VECTOR_ELT.
22627 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22628 return false;
22629
22630     // Quit if the index is not a constant.
22631 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22632 if (!Idx)
22633 return false;
22634
22635 SDValue Src = I->getOperand(0);
22636 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22637 if (M == SrcOpMap.end()) {
22638 VT = Src.getValueType();
22639 // Quit if not the same type.
22640 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22641 return false;
22642 unsigned NumElts = VT.getVectorNumElements();
22643 APInt EltCount = APInt::getZero(NumElts);
22644 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22645 SrcOps.push_back(Src);
22646 }
22647
22648 // Quit if element already used.
22649 unsigned CIdx = Idx->getZExtValue();
22650 if (M->second[CIdx])
22651 return false;
22652 M->second.setBit(CIdx);
22653 }
22654
22655 if (SrcMask) {
22656 // Collect the source partial masks.
22657 for (SDValue &SrcOp : SrcOps)
22658 SrcMask->push_back(SrcOpMap[SrcOp]);
22659 } else {
22660 // Quit if not all elements are used.
22661 for (const auto &I : SrcOpMap)
22662 if (!I.second.isAllOnes())
22663 return false;
22664 }
22665
22666 return true;
22667}
22668
22669// Helper function for comparing all bits of two vectors.
22670 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
22671                                    ISD::CondCode CC, const APInt &OriginalMask,
22672 const X86Subtarget &Subtarget,
22673 SelectionDAG &DAG, X86::CondCode &X86CC) {
22674 EVT VT = LHS.getValueType();
22675 unsigned ScalarSize = VT.getScalarSizeInBits();
22676 if (OriginalMask.getBitWidth() != ScalarSize) {
22677 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22678 return SDValue();
22679 }
22680
22681   // Quit if not convertible to a legal scalar or 128/256-bit vector.
22682 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22683 return SDValue();
22684
22685 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22686 if (VT.isFloatingPoint())
22687 return SDValue();
22688
22689 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22690 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22691
22692 APInt Mask = OriginalMask;
22693
22694 auto MaskBits = [&](SDValue Src) {
22695 if (Mask.isAllOnes())
22696 return Src;
22697 EVT SrcVT = Src.getValueType();
22698 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22699 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22700 };
22701
22702 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22703 if (VT.getSizeInBits() < 128) {
22704 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22705 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22706 if (IntVT != MVT::i64)
22707 return SDValue();
22708 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22709 MVT::i32, MVT::i32);
22710 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22711 MVT::i32, MVT::i32);
22712 SDValue Lo =
22713 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22714 SDValue Hi =
22715 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22716 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22717 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22718 DAG.getConstant(0, DL, MVT::i32));
22719 }
22720 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22721 DAG.getBitcast(IntVT, MaskBits(LHS)),
22722 DAG.getBitcast(IntVT, MaskBits(RHS)));
22723 }
22724
22725 // Without PTEST, a masked v2i64 or-reduction is not faster than
22726 // scalarization.
22727 bool UseKORTEST = Subtarget.useAVX512Regs();
22728 bool UsePTEST = Subtarget.hasSSE41();
22729 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22730 return SDValue();
22731
22732 // Split down to 128/256/512-bit vector.
22733 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22734
22735 // If the input vector has vector elements wider than the target test size,
22736 // then cast to <X x i64> so it will safely split.
22737 if (ScalarSize > TestSize) {
22738 if (!Mask.isAllOnes())
22739 return SDValue();
22740 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22741 LHS = DAG.getBitcast(VT, LHS);
22742 RHS = DAG.getBitcast(VT, RHS);
22743 Mask = APInt::getAllOnes(64);
22744 }
22745
22746 if (VT.getSizeInBits() > TestSize) {
22747 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22748 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22749 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22750 while (VT.getSizeInBits() > TestSize) {
22751 auto Split = DAG.SplitVector(LHS, DL);
22752 VT = Split.first.getValueType();
22753 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22754 }
22755 RHS = DAG.getAllOnesConstant(DL, VT);
22756 } else if (!UsePTEST && !KnownRHS.isZero()) {
22757 // MOVMSK Special Case:
22758 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22759 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22760 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22761 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22762 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22763 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22764 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22765 V = DAG.getSExtOrTrunc(V, DL, VT);
22766 while (VT.getSizeInBits() > TestSize) {
22767 auto Split = DAG.SplitVector(V, DL);
22768 VT = Split.first.getValueType();
22769 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22770 }
22771 V = DAG.getNOT(DL, V, VT);
22772 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22773 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22774 DAG.getConstant(0, DL, MVT::i32));
22775 } else {
22776 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22777 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22778 while (VT.getSizeInBits() > TestSize) {
22779 auto Split = DAG.SplitVector(V, DL);
22780 VT = Split.first.getValueType();
22781 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22782 }
22783 LHS = V;
22784 RHS = DAG.getConstant(0, DL, VT);
22785 }
22786 }
22787
22788 if (UseKORTEST && VT.is512BitVector()) {
22789 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22790 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22791 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22792 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22793 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22794 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22795 }
22796
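  // PTEST sets ZF when the AND of its two operands is zero; with
  // V = LHS ^ RHS, PTEST(V, V) therefore sets ZF iff LHS and RHS are
  // bit-for-bit identical.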
22797 if (UsePTEST) {
22798 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22799 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22800 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22801 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22802 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22803 }
22804
22805 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22806 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22807 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22808 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22809 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22810 V = DAG.getNOT(DL, V, MaskVT);
22811 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22812 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22813 DAG.getConstant(0, DL, MVT::i32));
22814}
22815
22816 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall
22817 // back to CMP(MOVMSK(PCMPEQB(X,Y))).
22818 static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22819                                        ISD::CondCode CC, const SDLoc &DL,
22820 const X86Subtarget &Subtarget,
22821 SelectionDAG &DAG,
22822 X86::CondCode &X86CC) {
22823 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22824
22825 bool CmpNull = isNullConstant(RHS);
22826 bool CmpAllOnes = isAllOnesConstant(RHS);
22827 if (!CmpNull && !CmpAllOnes)
22828 return SDValue();
22829
22830 SDValue Op = LHS;
22831 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22832 return SDValue();
22833
22834 // Check whether we're masking/truncating an OR-reduction result, in which
22835 // case track the masked bits.
22836 // TODO: Add CmpAllOnes support.
22837 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22838 if (CmpNull) {
22839 switch (Op.getOpcode()) {
22840 case ISD::TRUNCATE: {
22841 SDValue Src = Op.getOperand(0);
22842 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22843 Op.getScalarValueSizeInBits());
22844 Op = Src;
22845 break;
22846 }
22847 case ISD::AND: {
22848 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22849 Mask = Cst->getAPIntValue();
22850 Op = Op.getOperand(0);
22851 }
22852 break;
22853 }
22854 }
22855 }
22856
22857 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22858
22859 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22860 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22861   SmallVector<SDValue, 8> VecIns;
22862   if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22863 EVT VT = VecIns[0].getValueType();
22864 assert(llvm::all_of(VecIns,
22865 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22866 "Reduction source vector mismatch");
22867
22868 // Quit if not splittable to scalar/128/256/512-bit vector.
22869 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22870 return SDValue();
22871
22872 // If more than one full vector is evaluated, AND/OR them first before
22873 // PTEST.
22874 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22875 Slot += 2, e += 1) {
22876 // Each iteration will AND/OR 2 nodes and append the result until there is
22877 // only 1 node left, i.e. the final value of all vectors.
22878 SDValue LHS = VecIns[Slot];
22879 SDValue RHS = VecIns[Slot + 1];
22880 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22881 }
22882
22883 return LowerVectorAllEqual(DL, VecIns.back(),
22884 CmpNull ? DAG.getConstant(0, DL, VT)
22885 : DAG.getAllOnesConstant(DL, VT),
22886 CC, Mask, Subtarget, DAG, X86CC);
22887 }
22888
22889 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22890 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22891 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22892 ISD::NodeType BinOp;
22893 if (SDValue Match =
22894 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22895 EVT MatchVT = Match.getValueType();
22896       return LowerVectorAllEqual(DL, Match,
22897                                  CmpNull ? DAG.getConstant(0, DL, MatchVT)
22898 : DAG.getAllOnesConstant(DL, MatchVT),
22899 CC, Mask, Subtarget, DAG, X86CC);
22900 }
22901 }
22902
22903 if (Mask.isAllOnes()) {
22904 assert(!Op.getValueType().isVector() &&
22905 "Illegal vector type for reduction pattern");
22906     SDValue Src = peekThroughBitcasts(Op);
22907     if (Src.getValueType().isFixedLengthVector() &&
22908 Src.getValueType().getScalarType() == MVT::i1) {
22909 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22910 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22911 if (Src.getOpcode() == ISD::SETCC) {
22912 SDValue LHS = Src.getOperand(0);
22913 SDValue RHS = Src.getOperand(1);
22914 EVT LHSVT = LHS.getValueType();
22915 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22916 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22917 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22918 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22919 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22920 X86CC);
22921 }
22922 }
22923 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22924 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22925 // Peek through truncation, mask the LSB and compare against zero/LSB.
22926 if (Src.getOpcode() == ISD::TRUNCATE) {
22927 SDValue Inner = Src.getOperand(0);
22928 EVT InnerVT = Inner.getValueType();
22929 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22930 unsigned BW = InnerVT.getScalarSizeInBits();
22931 APInt SrcMask = APInt(BW, 1);
22932 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22933 return LowerVectorAllEqual(DL, Inner,
22934 DAG.getConstant(Cmp, DL, InnerVT), CC,
22935 SrcMask, Subtarget, DAG, X86CC);
22936 }
22937 }
22938 }
22939 }
22940
22941 return SDValue();
22942}
22943
22944 /// Return true if \c Op has a use that doesn't just read flags.
22945 static bool hasNonFlagsUse(SDValue Op) {
22946   for (SDUse &Use : Op->uses()) {
22947 SDNode *User = Use.getUser();
22948 unsigned UOpNo = Use.getOperandNo();
22949 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22950 // Look past truncate.
22951 UOpNo = User->use_begin()->getOperandNo();
22952 User = User->use_begin()->getUser();
22953 }
22954
22955 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22956 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22957 return true;
22958 }
22959 return false;
22960}
22961
22962// Transform to an x86-specific ALU node with flags if there is a chance of
22963// using an RMW op or only the flags are used. Otherwise, leave
22964// the node alone and emit a 'cmp' or 'test' instruction.
22965 static bool isProfitableToUseFlagOp(SDValue Op) {
22966   for (SDNode *U : Op->users())
22967 if (U->getOpcode() != ISD::CopyToReg &&
22968 U->getOpcode() != ISD::SETCC &&
22969 U->getOpcode() != ISD::STORE)
22970 return false;
22971
22972 return true;
22973}
22974
22975/// Emit nodes that will be selected as "test Op0,Op0", or something
22976/// equivalent.
22977static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22978 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22979 // CF and OF aren't always set the way we want. Determine which
22980 // of these we need.
22981 bool NeedCF = false;
22982 bool NeedOF = false;
22983 switch (X86CC) {
22984 default: break;
22985 case X86::COND_A: case X86::COND_AE:
22986 case X86::COND_B: case X86::COND_BE:
22987 NeedCF = true;
22988 break;
22989 case X86::COND_G: case X86::COND_GE:
22990 case X86::COND_L: case X86::COND_LE:
22991 case X86::COND_O: case X86::COND_NO: {
22992 // Check if we really need to set the
22993 // Overflow flag. If NoSignedWrap is present
22994 // that is not actually needed.
22995 switch (Op->getOpcode()) {
22996 case ISD::ADD:
22997 case ISD::SUB:
22998 case ISD::MUL:
22999 case ISD::SHL:
23000 if (Op.getNode()->getFlags().hasNoSignedWrap())
23001 break;
23002 [[fallthrough]];
23003 default:
23004 NeedOF = true;
23005 break;
23006 }
23007 break;
23008 }
23009 }
23010 // See if we can use the EFLAGS value from the operand instead of
23011 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23012 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23013 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23014 // Emit a CMP with 0, which is the TEST pattern.
23015 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23016 DAG.getConstant(0, dl, Op.getValueType()));
23017 }
23018 unsigned Opcode = 0;
23019 unsigned NumOperands = 0;
23020
23021 SDValue ArithOp = Op;
23022
23023 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23024 // which may be the result of a CAST. We use the variable 'Op', which is the
23025 // non-casted variable when we check for possible users.
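  // For example, for 'if ((x & y) == 0)' where the AND result is also stored,
  // lowering to X86ISD::AND yields both the value and EFLAGS, so no separate
  // TEST instruction is needed.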
23026 switch (ArithOp.getOpcode()) {
23027 case ISD::AND:
23028 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23029 // because a TEST instruction will be better.
23030 if (!hasNonFlagsUse(Op))
23031 break;
23032
23033 [[fallthrough]];
23034 case ISD::ADD:
23035 case ISD::SUB:
23036 case ISD::OR:
23037 case ISD::XOR:
23038     if (!isProfitableToUseFlagOp(Op))
23039       break;
23040
23041 // Otherwise use a regular EFLAGS-setting instruction.
23042 switch (ArithOp.getOpcode()) {
23043 // clang-format off
23044 default: llvm_unreachable("unexpected operator!");
23045 case ISD::ADD: Opcode = X86ISD::ADD; break;
23046 case ISD::SUB: Opcode = X86ISD::SUB; break;
23047 case ISD::XOR: Opcode = X86ISD::XOR; break;
23048 case ISD::AND: Opcode = X86ISD::AND; break;
23049 case ISD::OR: Opcode = X86ISD::OR; break;
23050 // clang-format on
23051 }
23052
23053 NumOperands = 2;
23054 break;
23055 case X86ISD::ADD:
23056 case X86ISD::SUB:
23057 case X86ISD::OR:
23058 case X86ISD::XOR:
23059 case X86ISD::AND:
23060 return SDValue(Op.getNode(), 1);
23061 case ISD::SSUBO:
23062 case ISD::USUBO: {
23063     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23064 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23065 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23066 Op->getOperand(1)).getValue(1);
23067 }
23068 default:
23069 break;
23070 }
23071
23072 if (Opcode == 0) {
23073 // Emit a CMP with 0, which is the TEST pattern.
23074 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23075 DAG.getConstant(0, dl, Op.getValueType()));
23076 }
23077 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23078 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23079
23080 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23081 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23082 return SDValue(New.getNode(), 1);
23083}
23084
23085/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23086/// equivalent.
23087static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23088 const SDLoc &dl, SelectionDAG &DAG,
23089 const X86Subtarget &Subtarget) {
23090 if (isNullConstant(Op1))
23091 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23092
23093 EVT CmpVT = Op0.getValueType();
23094
23095 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23096 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23097
23098 // Only promote the compare up to I32 if it is a 16 bit operation
23099 // with an immediate. 16 bit immediates are to be avoided unless the target
23100 // isn't slowed down by length changing prefixes, we're optimizing for
23101 // codesize or the comparison is with a folded load.
23102 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23103 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23104 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23105 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23106 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23107 // Don't do this if the immediate can fit in 8-bits.
23108 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23109 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23110 unsigned ExtendOp =
23111 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23112 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23113 // For equality comparisons try to use SIGN_EXTEND if the input was
23114 // truncate from something with enough sign bits.
23115 if (Op0.getOpcode() == ISD::TRUNCATE) {
23116 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23117 ExtendOp = ISD::SIGN_EXTEND;
23118 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23119 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23120 ExtendOp = ISD::SIGN_EXTEND;
23121 }
23122 }
23123
23124 CmpVT = MVT::i32;
23125 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23126 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23127 }
23128 }
23129
23130 // Try to shrink i64 compares if the input has enough zero bits.
23131 // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
23132 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23133 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23134 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23135 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23136 CmpVT = MVT::i32;
23137 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23138 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23139 }
23140
23141 // 0-x == y --> x+y == 0
23142 // 0-x != y --> x+y != 0
23143 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23144 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23145 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23146 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23147 return Add.getValue(1);
23148 }
23149
23150 // x == 0-y --> x+y == 0
23151 // x != 0-y --> x+y != 0
23152 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23153 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23154 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23155 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23156 return Add.getValue(1);
23157 }
23158
23159 // Use SUB instead of CMP to enable CSE between SUB and CMP.
23160 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23161 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
23162 return Sub.getValue(1);
23163}
23164
23165bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
23166 EVT VT) const {
23167 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
23168}
23169
23170bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23171 SDNode *N, SDValue, SDValue IntPow2) const {
23172 if (N->getOpcode() == ISD::FDIV)
23173 return true;
23174
23175 EVT FPVT = N->getValueType(0);
23176 EVT IntVT = IntPow2.getValueType();
23177
23178 // This indicates a non-free bitcast.
23179 // TODO: This is probably overly conservative as we will need to scale the
23180 // integer vector anyways for the int->fp cast.
23181 if (FPVT.isVector() &&
23182 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23183 return false;
23184
23185 return true;
23186}
23187
23188/// Check if replacement of SQRT with RSQRT should be disabled.
23189bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23190 EVT VT = Op.getValueType();
23191
23192 // We don't need to replace SQRT with RSQRT for half type.
23193 if (VT.getScalarType() == MVT::f16)
23194 return true;
23195
23196 // We never want to use both SQRT and RSQRT instructions for the same input.
23197 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23198 return false;
23199
23200 if (VT.isVector())
23201 return Subtarget.hasFastVectorFSQRT();
23202 return Subtarget.hasFastScalarFSQRT();
23203}
23204
23205/// The minimum architected relative accuracy is 2^-12. We need one
23206/// Newton-Raphson step to have a good float result (24 bits of precision).
23207SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23208 SelectionDAG &DAG, int Enabled,
23209 int &RefinementSteps,
23210 bool &UseOneConstNR,
23211 bool Reciprocal) const {
23212 SDLoc DL(Op);
23213 EVT VT = Op.getValueType();
23214
23215 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23216 // It is likely not profitable to do this for f64 because a double-precision
23217 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23218 // instructions: convert to single, rsqrtss, convert back to double, refine
23219 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23220 // along with FMA, this could be a throughput win.
23221 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23222 // after legalize types.
23223 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23224 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23225 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23226 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23227 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23228 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23229 RefinementSteps = 1;
23230
23231 UseOneConstNR = false;
23232 // There is no FRSQRT for 512-bits, but there is RSQRT14.
23233 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23234 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
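// With no refinement steps and a non-reciprocal request, sqrt(x) is
// approximated directly as x * rsqrt(x).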
23235 if (RefinementSteps == 0 && !Reciprocal)
23236 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23237 return Estimate;
23238 }
23239
23240 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23241 Subtarget.hasFP16()) {
23242 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23243 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23244 RefinementSteps = 0;
23245
23246 if (VT == MVT::f16) {
23247 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23248 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23249 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23250 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23251 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23252 }
23253
23254 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23255 }
23256 return SDValue();
23257}
23258
23259/// The minimum architected relative accuracy is 2^-12. We need one
23260/// Newton-Raphson step to have a good float result (24 bits of precision).
23261SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23262 int Enabled,
23263 int &RefinementSteps) const {
23264 SDLoc DL(Op);
23265 EVT VT = Op.getValueType();
23266
23267 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23268 // It is likely not profitable to do this for f64 because a double-precision
23269 // reciprocal estimate with refinement on x86 prior to FMA requires
23270 // 15 instructions: convert to single, rcpss, convert back to double, refine
23271 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23272 // along with FMA, this could be a throughput win.
23273
23274 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23275 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23276 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23277 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23278 // Enable estimate codegen with 1 refinement step for vector division.
23279 // Scalar division estimates are disabled because they break too much
23280 // real-world code. These defaults are intended to match GCC behavior.
23281 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23282 return SDValue();
23283
23284 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23285 RefinementSteps = 1;
23286
23287 // There is no FRCP for 512-bits, but there is RCP14.
23288 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23289 return DAG.getNode(Opcode, DL, VT, Op);
23290 }
23291
23292 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23293 Subtarget.hasFP16()) {
23294 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23295 RefinementSteps = 0;
23296
23297 if (VT == MVT::f16) {
23298 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23299 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23300 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23301 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23302 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23303 }
23304
23305 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23306 }
23307 return SDValue();
23308}
23309
23310/// If we have at least two divisions that use the same divisor, convert to
23311/// multiplication by a reciprocal. This may need to be adjusted for a given
23312/// CPU if a division's cost is not at least twice the cost of a multiplication.
23313/// This is because we still need one division to calculate the reciprocal and
23314/// then we need two multiplies by that reciprocal as replacements for the
23315/// original divisions.
23316unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23317 return 2;
23318}
23319
23320SDValue
23321X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23322 SelectionDAG &DAG,
23323 SmallVectorImpl<SDNode *> &Created) const {
23324 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23325 if (isIntDivCheap(N->getValueType(0), Attr))
23326 return SDValue(N,0); // Lower SDIV as SDIV
23327
23328 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23329 "Unexpected divisor!");
23330
23331 // Only perform this transform if CMOV is supported otherwise the select
23332 // below will become a branch.
23333 if (!Subtarget.canUseCMOV())
23334 return SDValue();
23335
23336 // fold (sdiv X, pow2)
23337 EVT VT = N->getValueType(0);
23338 // FIXME: Support i8.
23339 if (VT != MVT::i16 && VT != MVT::i32 &&
23340 !(Subtarget.is64Bit() && VT == MVT::i64))
23341 return SDValue();
23342
23343 // If the divisor is 2 or -2, the default expansion is better.
23344 if (Divisor == 2 ||
23345 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23346 return SDValue();
23347
23348 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23349}
23350
23351/// Result of 'and' is compared against zero. Change to a BT node if possible.
23352/// Returns the BT node and the condition code needed to use it.
23353 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23354 SelectionDAG &DAG, X86::CondCode &X86CC) {
23355 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23356 SDValue Op0 = And.getOperand(0);
23357 SDValue Op1 = And.getOperand(1);
23358 if (Op0.getOpcode() == ISD::TRUNCATE)
23359 Op0 = Op0.getOperand(0);
23360 if (Op1.getOpcode() == ISD::TRUNCATE)
23361 Op1 = Op1.getOperand(0);
23362
23363 SDValue Src, BitNo;
23364 if (Op1.getOpcode() == ISD::SHL)
23365 std::swap(Op0, Op1);
23366 if (Op0.getOpcode() == ISD::SHL) {
23367 if (isOneConstant(Op0.getOperand(0))) {
23368 // If we looked past a truncate, check that it's only truncating away
23369 // known zeros.
23370 unsigned BitWidth = Op0.getValueSizeInBits();
23371 unsigned AndBitWidth = And.getValueSizeInBits();
23372 if (BitWidth > AndBitWidth) {
23373 KnownBits Known = DAG.computeKnownBits(Op0);
23374 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23375 return SDValue();
23376 }
23377 Src = Op1;
23378 BitNo = Op0.getOperand(1);
23379 }
23380 } else if (Op1.getOpcode() == ISD::Constant) {
23381 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23382 uint64_t AndRHSVal = AndRHS->getZExtValue();
23383 SDValue AndLHS = Op0;
23384
23385 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23386 Src = AndLHS.getOperand(0);
23387 BitNo = AndLHS.getOperand(1);
23388 } else {
23389 // Use BT if the immediate can't be encoded in a TEST instruction or we
23390 // are optimizing for size and the immediate won't fit in a byte.
23391 bool OptForSize = DAG.shouldOptForSize();
23392 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23393 isPowerOf2_64(AndRHSVal)) {
23394 Src = AndLHS;
23395 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23396 Src.getValueType());
23397 }
23398 }
23399 }
23400
23401 // No patterns found, give up.
23402 if (!Src.getNode())
23403 return SDValue();
23404
23405 // Remove any bit flip.
23406 if (isBitwiseNot(Src)) {
23407 Src = Src.getOperand(0);
23408 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23409 }
23410
23411 // Attempt to create the X86ISD::BT node.
23412 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23413 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23414 return BT;
23415 }
23416
23417 return SDValue();
23418}
23419
23420// Check if pre-AVX condcode can be performed by a single FCMP op.
23421static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23422 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23423}
23424
23425/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23426/// CMPs.
23427static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23428 SDValue &Op1, bool &IsAlwaysSignaling) {
23429 unsigned SSECC;
23430 bool Swap = false;
23431
23432 // SSE Condition code mapping:
23433 // 0 - EQ
23434 // 1 - LT
23435 // 2 - LE
23436 // 3 - UNORD
23437 // 4 - NEQ
23438 // 5 - NLT
23439 // 6 - NLE
23440 // 7 - ORD
23441 switch (SetCCOpcode) {
23442 // clang-format off
23443 default: llvm_unreachable("Unexpected SETCC condition");
23444 case ISD::SETOEQ:
23445 case ISD::SETEQ: SSECC = 0; break;
23446 case ISD::SETOGT:
23447 case ISD::SETGT: Swap = true; [[fallthrough]];
23448 case ISD::SETLT:
23449 case ISD::SETOLT: SSECC = 1; break;
23450 case ISD::SETOGE:
23451 case ISD::SETGE: Swap = true; [[fallthrough]];
23452 case ISD::SETLE:
23453 case ISD::SETOLE: SSECC = 2; break;
23454 case ISD::SETUO: SSECC = 3; break;
23455 case ISD::SETUNE:
23456 case ISD::SETNE: SSECC = 4; break;
23457 case ISD::SETULE: Swap = true; [[fallthrough]];
23458 case ISD::SETUGE: SSECC = 5; break;
23459 case ISD::SETULT: Swap = true; [[fallthrough]];
23460 case ISD::SETUGT: SSECC = 6; break;
23461 case ISD::SETO: SSECC = 7; break;
23462 case ISD::SETUEQ: SSECC = 8; break;
23463 case ISD::SETONE: SSECC = 12; break;
23464 // clang-format on
23465 }
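// Note: SSECC values 8 (EQ_UQ) and 12 (NEQ_OQ) only exist in the 5-bit AVX
// VCMP immediate; pre-AVX callers must lower SETUEQ/SETONE with two compares
// instead (see cheapX86FSETCC_SSE).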
23466 if (Swap)
23467 std::swap(Op0, Op1);
23468
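// Classify whether the chosen predicate signals on quiet NaNs: the relational
// predicates (LT/LE/GT/GE and their unordered variants) always signal, while
// the EQ/NE/ORD/UNORD family has quiet encodings.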
23469 switch (SetCCOpcode) {
23470 default:
23471 IsAlwaysSignaling = true;
23472 break;
23473 case ISD::SETEQ:
23474 case ISD::SETOEQ:
23475 case ISD::SETUEQ:
23476 case ISD::SETNE:
23477 case ISD::SETONE:
23478 case ISD::SETUNE:
23479 case ISD::SETO:
23480 case ISD::SETUO:
23481 IsAlwaysSignaling = false;
23482 break;
23483 }
23484
23485 return SSECC;
23486}
23487
23488/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23489/// concatenate the result back.
23490 static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23491 SelectionDAG &DAG, const SDLoc &dl) {
23492 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23493 "Unsupported VTs!");
23494 SDValue CC = DAG.getCondCode(Cond);
23495
23496 // Extract the LHS Lo/Hi vectors
23497 SDValue LHS1, LHS2;
23498 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23499
23500 // Extract the RHS Lo/Hi vectors
23501 SDValue RHS1, RHS2;
23502 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23503
23504 // Issue the operation on the smaller types and concatenate the result back
23505 EVT LoVT, HiVT;
23506 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23507 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23508 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23509 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23510}
23511
23512 static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23513 SelectionDAG &DAG) {
23514 SDValue Op0 = Op.getOperand(0);
23515 SDValue Op1 = Op.getOperand(1);
23516 SDValue CC = Op.getOperand(2);
23517 MVT VT = Op.getSimpleValueType();
23518 assert(VT.getVectorElementType() == MVT::i1 &&
23519 "Cannot set masked compare for this operation");
23520
23521 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23522
23523 // Prefer SETGT over SETLT.
23524 if (SetCCOpcode == ISD::SETLT) {
23525 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23526 std::swap(Op0, Op1);
23527 }
23528
23529 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23530}
23531
23532/// Given a buildvector constant, return a new vector constant with each element
23533/// incremented or decremented. If incrementing or decrementing would result in
23534/// unsigned overflow or underflow or this is not a simple vector constant,
23535/// return an empty value.
23536 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23537 bool NSW) {
23538 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23539 if (!BV || !V.getValueType().isSimple())
23540 return SDValue();
23541
23542 MVT VT = V.getSimpleValueType();
23543 MVT EltVT = VT.getVectorElementType();
23544 unsigned NumElts = VT.getVectorNumElements();
23545 SmallVector<SDValue, 8> NewVecC;
23546 SDLoc DL(V);
23547 for (unsigned i = 0; i < NumElts; ++i) {
23548 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23549 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23550 return SDValue();
23551
23552 // Avoid overflow/underflow.
23553 const APInt &EltC = Elt->getAPIntValue();
23554 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23555 return SDValue();
23556 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23557 (!IsInc && EltC.isMinSignedValue())))
23558 return SDValue();
23559
23560 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23561 }
23562
23563 return DAG.getBuildVector(VT, DL, NewVecC);
23564}
23565
23566/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23567/// Op0 u<= Op1:
23568/// t = psubus Op0, Op1
23569/// pcmpeq t, <0..0>
23570 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23571 ISD::CondCode Cond, const SDLoc &dl,
23572 const X86Subtarget &Subtarget,
23573 SelectionDAG &DAG) {
23574 if (!Subtarget.hasSSE2())
23575 return SDValue();
23576
23577 MVT VET = VT.getVectorElementType();
23578 if (VET != MVT::i8 && VET != MVT::i16)
23579 return SDValue();
23580
23581 switch (Cond) {
23582 default:
23583 return SDValue();
23584 case ISD::SETULT: {
23585 // If the comparison is against a constant, we can turn this into a
23586 // setule. With psubus, setule does not require a swap. This is
23587 // beneficial because the constant in the register is no longer
23588 // clobbered as the destination, so it can be hoisted out of a loop.
23589 // Only do this pre-AVX, since AVX's vpcmp* forms are non-destructive.
23590 if (Subtarget.hasAVX())
23591 return SDValue();
23592 SDValue ULEOp1 =
23593 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23594 if (!ULEOp1)
23595 return SDValue();
23596 Op1 = ULEOp1;
23597 break;
23598 }
23599 case ISD::SETUGT: {
23600 // If the comparison is against a constant, we can turn this into a setuge.
23601 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23602 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23603 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23604 SDValue UGEOp1 =
23605 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23606 if (!UGEOp1)
23607 return SDValue();
23608 Op1 = Op0;
23609 Op0 = UGEOp1;
23610 break;
23611 }
23612 // Psubus is better than flip-sign because it requires no inversion.
23613 case ISD::SETUGE:
23614 std::swap(Op0, Op1);
23615 break;
23616 case ISD::SETULE:
23617 break;
23618 }
23619
23620 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23621 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23622 DAG.getConstant(0, dl, VT));
23623}
23624
23625static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23626 SelectionDAG &DAG) {
23627 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23628 Op.getOpcode() == ISD::STRICT_FSETCCS;
23629 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23630 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23631 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23632 MVT VT = Op->getSimpleValueType(0);
23633 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23634 MVT OpVT = Op0.getSimpleValueType();
23635 SDLoc dl(Op);
23636
23637 if (OpVT.isFloatingPoint()) {
23638 MVT EltVT = OpVT.getVectorElementType();
23639 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
23640 EltVT == MVT::f64);
23641
23642 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23643 if (isSoftF16(EltVT, Subtarget)) {
23644 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
23645 return SDValue();
23646
23647 // Break 256-bit FP vector compare into smaller ones.
23648 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
23649 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23650
23651 // Break 512-bit FP vector compare into smaller ones.
23652 if (OpVT.is512BitVector())
23653 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23654
23655 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
23656 if (IsStrict) {
23657 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
23658 {Chain, Op0});
23659 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
23660 {Chain, Op1});
23661 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
23662 {Chain, Op0, Op1, CC});
23663 }
23664 MVT DVT = VT.getVectorElementType() == MVT::i16
23665 ? VT.changeVectorElementType(MVT::i32)
23666 : VT;
23667 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
23668 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
23669 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
23670 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
23671 }
23672
23673 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23674
23675 // If we have a strict compare with a vXi1 result and the input is 128/256
23676 // bits we can't use a masked compare unless we have VLX. If we use a wider
23677 // compare like we do for non-strict, we might trigger spurious exceptions
23678 // from the upper elements. Instead emit an AVX compare and convert to mask.
23679 unsigned Opc;
23680 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23681 (!IsStrict || Subtarget.hasVLX() ||
23682 Op0.getSimpleValueType().is512BitVector())) {
23683#ifndef NDEBUG
23684 unsigned Num = VT.getVectorNumElements();
23685 assert(Num <= 16 ||
23686 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
23687#endif
23688 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23689 } else {
23690 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23691 // The SSE/AVX packed FP comparison nodes are defined with a
23692 // floating-point vector result that matches the operand type. This allows
23693 // them to work with an SSE1 target (integer vector types are not legal).
23694 VT = Op0.getSimpleValueType();
23695 }
23696
23697 SDValue Cmp;
23698 bool IsAlwaysSignaling;
23699 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23700 if (!Subtarget.hasAVX()) {
23701 // TODO: We could use following steps to handle a quiet compare with
23702 // signaling encodings.
23703 // 1. Get ordered masks from a quiet ISD::SETO
23704 // 2. Use the masks to mask potential unordered elements in operand A, B
23705 // 3. Get the compare results of masked A, B
23706 // 4. Calculating final result using the mask and result from 3
23707 // But currently, we just fall back to scalar operations.
23708 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23709 return SDValue();
23710
23711 // Insert an extra signaling instruction to raise exception.
23712 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23713 SDValue SignalCmp = DAG.getNode(
23714 Opc, dl, {VT, MVT::Other},
23715 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23716 // FIXME: It seems we need to update the flags of all new strict nodes.
23717 // Otherwise, mayRaiseFPException in MI will return false due to
23718 // NoFPExcept = false by default. However, I didn't find it in other
23719 // patches.
23720 SignalCmp->setFlags(Op->getFlags());
23721 Chain = SignalCmp.getValue(1);
23722 }
23723
23724 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23725 // emit two comparisons and a logic op to tie them together.
23726 if (!cheapX86FSETCC_SSE(Cond)) {
23727 // LLVM predicate is SETUEQ or SETONE.
23728 unsigned CC0, CC1;
23729 unsigned CombineOpc;
23730 if (Cond == ISD::SETUEQ) {
23731 CC0 = 3; // UNORD
23732 CC1 = 0; // EQ
23733 CombineOpc = X86ISD::FOR;
23734 } else {
23735 assert(Cond == ISD::SETONE);
23736 CC0 = 7; // ORD
23737 CC1 = 4; // NEQ
23738 CombineOpc = X86ISD::FAND;
23739 }
23740
23741 SDValue Cmp0, Cmp1;
23742 if (IsStrict) {
23743 Cmp0 = DAG.getNode(
23744 Opc, dl, {VT, MVT::Other},
23745 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23746 Cmp1 = DAG.getNode(
23747 Opc, dl, {VT, MVT::Other},
23748 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23749 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23750 Cmp1.getValue(1));
23751 } else {
23752 Cmp0 = DAG.getNode(
23753 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23754 Cmp1 = DAG.getNode(
23755 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23756 }
23757 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23758 } else {
23759 if (IsStrict) {
23760 Cmp = DAG.getNode(
23761 Opc, dl, {VT, MVT::Other},
23762 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23763 Chain = Cmp.getValue(1);
23764 } else
23765 Cmp = DAG.getNode(
23766 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23767 }
23768 } else {
23769 // Handle all other FP comparisons here.
23770 if (IsStrict) {
23771 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23772 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23773 Cmp = DAG.getNode(
23774 Opc, dl, {VT, MVT::Other},
23775 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23776 Chain = Cmp.getValue(1);
23777 } else
23778 Cmp = DAG.getNode(
23779 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23780 }
23781
23782 if (VT.getFixedSizeInBits() >
23783 Op.getSimpleValueType().getFixedSizeInBits()) {
23784 // We emitted a compare with an XMM/YMM result. Finish converting to a
23785 // mask register using a vptestm.
23786 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23787 Cmp = DAG.getBitcast(CastVT, Cmp);
23788 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23789 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23790 } else {
23791 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23792 // the result type of SETCC. The bitcast is expected to be optimized
23793 // away during combining/isel.
23794 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23795 }
23796
23797 if (IsStrict)
23798 return DAG.getMergeValues({Cmp, Chain}, dl);
23799
23800 return Cmp;
23801 }
23802
23803 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23804
23805 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
23806 assert(VTOp0 == Op1.getSimpleValueType() &&
23807 "Expected operands with same type!");
23808 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23809 "Invalid number of packed elements for source and destination!");
23810
23811 // The non-AVX512 code below works under the assumption that source and
23812 // destination types are the same.
23813 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23814 "Value types for source and destination must be the same!");
23815
23816 // The result is boolean, but operands are int/float
23817 if (VT.getVectorElementType() == MVT::i1) {
23818 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23819 // but there is no compare instruction for i8 and i16 elements in KNL.
23820 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23821 "Unexpected operand type");
23822 return LowerIntVSETCC_AVX512(Op, dl, DAG);
23823 }
23824
23825 // Lower using XOP integer comparisons.
23826 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23827 // Translate compare code to XOP PCOM compare mode.
23828 unsigned CmpMode = 0;
23829 switch (Cond) {
23830 // clang-format off
23831 default: llvm_unreachable("Unexpected SETCC condition");
23832 case ISD::SETULT:
23833 case ISD::SETLT: CmpMode = 0x00; break;
23834 case ISD::SETULE:
23835 case ISD::SETLE: CmpMode = 0x01; break;
23836 case ISD::SETUGT:
23837 case ISD::SETGT: CmpMode = 0x02; break;
23838 case ISD::SETUGE:
23839 case ISD::SETGE: CmpMode = 0x03; break;
23840 case ISD::SETEQ: CmpMode = 0x04; break;
23841 case ISD::SETNE: CmpMode = 0x05; break;
23842 // clang-format on
23843 }
23844
23845 // Are we comparing unsigned or signed integers?
23846 unsigned Opc =
23847 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23848
23849 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23850 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23851 }
23852
23853 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23854 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23855 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23856 SDValue BC0 = peekThroughBitcasts(Op0);
23857 if (BC0.getOpcode() == ISD::AND &&
23858 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
23859 /*AllowUndefs=*/false)) {
23860 Cond = ISD::SETEQ;
23861 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23862 }
23863 }
23864
23865 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23866 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23867 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23868 auto *C1 = isConstOrConstSplat(Op1);
23869 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23870 unsigned BitWidth = VT.getScalarSizeInBits();
23871 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23872
23873 SDValue Result = Op0.getOperand(0);
23874 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23875 DAG.getConstant(ShiftAmt, dl, VT));
23876 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23877 DAG.getConstant(BitWidth - 1, dl, VT));
23878 return Result;
23879 }
23880 }
23881
23882 // Break 256-bit integer vector compare into smaller ones.
23883 if (VT.is256BitVector() && !Subtarget.hasInt256())
23884 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23885
23886 // Break 512-bit integer vector compare into smaller ones.
23887 // TODO: Try harder to use VPCMPx + VPMOV2x?
23888 if (VT.is512BitVector())
23889 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23890
23891 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23892 // not-of-PCMPEQ:
23893 // X != INT_MIN --> X >s INT_MIN
23894 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23895 // +X != 0 --> +X >s 0
23896 APInt ConstValue;
23897 if (Cond == ISD::SETNE &&
23898 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23899 if (ConstValue.isMinSignedValue())
23900 Cond = ISD::SETGT;
23901 else if (ConstValue.isMaxSignedValue())
23902 Cond = ISD::SETLT;
23903 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23904 Cond = ISD::SETGT;
23905 }
23906
23907 // If both operands are known non-negative, then an unsigned compare is the
23908 // same as a signed compare and there's no need to flip signbits.
23909 // TODO: We could check for more general simplifications here since we're
23910 // computing known bits.
23911 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23912 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23913
23914 // Special case: Use min/max operations for unsigned compares.
23915 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23916 if (ISD::isUnsignedIntSetCC(Cond) &&
23917 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23918 TLI.isOperationLegal(ISD::UMIN, VT)) {
23919 // If we have a constant operand, increment/decrement it and change the
23920 // condition to avoid an invert.
23921 if (Cond == ISD::SETUGT) {
23922 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23923 if (SDValue UGTOp1 =
23924 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23925 Op1 = UGTOp1;
23926 Cond = ISD::SETUGE;
23927 }
23928 }
23929 if (Cond == ISD::SETULT) {
23930 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23931 if (SDValue ULTOp1 =
23932 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23933 Op1 = ULTOp1;
23934 Cond = ISD::SETULE;
23935 }
23936 }
23937 bool Invert = false;
23938 unsigned Opc;
23939 switch (Cond) {
23940 // clang-format off
23941 default: llvm_unreachable("Unexpected condition code");
23942 case ISD::SETUGT: Invert = true; [[fallthrough]];
23943 case ISD::SETULE: Opc = ISD::UMIN; break;
23944 case ISD::SETULT: Invert = true; [[fallthrough]];
23945 case ISD::SETUGE: Opc = ISD::UMAX; break;
23946 // clang-format on
23947 }
23948
23949 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23950 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23951
23952 // If the logical-not of the result is required, perform that now.
23953 if (Invert)
23954 Result = DAG.getNOT(dl, Result, VT);
23955
23956 return Result;
23957 }
23958
23959 // Try to use SUBUS and PCMPEQ.
23960 if (FlipSigns)
23961 if (SDValue V =
23962 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23963 return V;
23964
23965 // We are handling one of the integer comparisons here. Since SSE only has
23966 // GT and EQ comparisons for integer, swapping operands and multiple
23967 // operations may be required for some comparisons.
23968 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23969 : X86ISD::PCMPGT;
23970 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23971 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23972 bool Invert = Cond == ISD::SETNE ||
23973 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23974
23975 if (Swap)
23976 std::swap(Op0, Op1);
23977
23978 // Check that the operation in question is available (most are plain SSE2,
23979 // but PCMPGTQ and PCMPEQQ have different requirements).
23980 if (VT == MVT::v2i64) {
23981 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23982 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23983
23984 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23985 // the odd elements over the even elements.
23986 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23987 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23988 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23989
23990 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23991 static const int MaskHi[] = { 1, 1, 3, 3 };
23992 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23993
23994 return DAG.getBitcast(VT, Result);
23995 }
23996
23997 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23998 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23999 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24000
24001 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24002 static const int MaskHi[] = { 1, 1, 3, 3 };
24003 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24004
24005 return DAG.getBitcast(VT, Result);
24006 }
24007
24008 // If the i64 elements are sign-extended enough to be representable as i32
24009 // then we can compare the lower i32 bits and splat.
24010 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24011 DAG.ComputeNumSignBits(Op1) > 32) {
24012 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24013 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24014
24015 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24016 static const int MaskLo[] = {0, 0, 2, 2};
24017 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24018
24019 return DAG.getBitcast(VT, Result);
24020 }
24021
24022 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24023 // bits of the inputs before performing those operations. The lower
24024 // compare is always unsigned.
24025 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24026 : 0x0000000080000000ULL,
24027 dl, MVT::v2i64);
24028
24029 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24030 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24031
24032 // Cast everything to the right type.
24033 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24034 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24035
24036 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24037 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24038 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24039
24040 // Create masks for only the low parts/high parts of the 64 bit integers.
24041 static const int MaskHi[] = { 1, 1, 3, 3 };
24042 static const int MaskLo[] = { 0, 0, 2, 2 };
24043 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24044 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24045 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24046
24047 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24048 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24049
24050 if (Invert)
24051 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24052
24053 return DAG.getBitcast(VT, Result);
24054 }
24055
24056 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24057 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24058 // pcmpeqd + pshufd + pand.
24059 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24060
24061 // First cast everything to the right type.
24062 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24063 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24064
24065 // Do the compare.
24066 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24067
24068 // Make sure the lower and upper halves are both all-ones.
24069 static const int Mask[] = { 1, 0, 3, 2 };
24070 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24071 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24072
24073 if (Invert)
24074 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24075
24076 return DAG.getBitcast(VT, Result);
24077 }
24078 }
24079
24080 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24081 // bits of the inputs before performing those operations.
24082 if (FlipSigns) {
24083 MVT EltVT = VT.getVectorElementType();
24084 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24085 VT);
24086 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24087 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24088 }
24089
24090 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24091
24092 // If the logical-not of the result is required, perform that now.
24093 if (Invert)
24094 Result = DAG.getNOT(dl, Result, VT);
24095
24096 return Result;
24097}
24098
24099// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24100 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24101 const SDLoc &dl, SelectionDAG &DAG,
24102 const X86Subtarget &Subtarget,
24103 SDValue &X86CC) {
24104 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24105
24106 // Must be a bitcast from vXi1.
24107 if (Op0.getOpcode() != ISD::BITCAST)
24108 return SDValue();
24109
24110 Op0 = Op0.getOperand(0);
24111 MVT VT = Op0.getSimpleValueType();
24112 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24113 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24114 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24115 return SDValue();
24116
24117 X86::CondCode X86Cond;
24118 if (isNullConstant(Op1)) {
24119 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24120 } else if (isAllOnesConstant(Op1)) {
24121 // C flag is set for all ones.
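// (KORTEST sets CF when the OR of its operands is all ones, so comparing the
// mask against all-ones maps onto the carry flag; the KTEST path below is only
// used for the compare-with-zero case.)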
24122 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24123 } else
24124 return SDValue();
24125
24126 // If the input is an AND, we can combine its operands into the KTEST.
24127 bool KTestable = false;
24128 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24129 KTestable = true;
24130 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24131 KTestable = true;
24132 if (!isNullConstant(Op1))
24133 KTestable = false;
24134 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24135 SDValue LHS = Op0.getOperand(0);
24136 SDValue RHS = Op0.getOperand(1);
24137 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24138 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24139 }
24140
24141 // If the input is an OR, we can combine its operands into the KORTEST.
24142 SDValue LHS = Op0;
24143 SDValue RHS = Op0;
24144 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24145 LHS = Op0.getOperand(0);
24146 RHS = Op0.getOperand(1);
24147 }
24148
24149 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24150 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24151}
24152
24153/// Emit flags for the given setcc condition and operands. Also returns the
24154/// corresponding X86 condition code constant in X86CC.
24155SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24156 ISD::CondCode CC, const SDLoc &dl,
24157 SelectionDAG &DAG,
24158 SDValue &X86CC) const {
24159 // Equality Combines.
24160 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24161 X86::CondCode X86CondCode;
24162
24163 // Optimize to BT if possible.
24164 // Lower (X & (1 << N)) == 0 to BT(X, N).
24165 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24166 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24167 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24168 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24169 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24170 return BT;
24171 }
24172 }
24173
24174 // Try to use PTEST/PMOVMSKB for a tree of ANDs/ORs equality compared with -1/0.
24175 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24176 X86CondCode)) {
24177 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24178 return CmpZ;
24179 }
24180
24181 // Try to lower using KORTEST or KTEST.
24182 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24183 return Test;
24184
24185 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24186 // of these.
24187 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24188 // If the input is a setcc, then reuse the input setcc or use a new one
24189 // with the inverted condition.
24190 if (Op0.getOpcode() == X86ISD::SETCC) {
24191 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24192
24193 X86CC = Op0.getOperand(0);
24194 if (Invert) {
24195 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24196 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24197 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24198 }
24199
24200 return Op0.getOperand(1);
24201 }
24202 }
24203
24204 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24205 // overflow.
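// NEG overflows (sets OF) exactly when its operand is INT_MIN, so the
// overflow condition gives the equality result directly.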
24206 if (isMinSignedConstant(Op1)) {
24207 EVT VT = Op0.getValueType();
24208 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24209 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24210 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24211 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24212 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24213 DAG.getConstant(0, dl, VT), Op0);
24214 return SDValue(Neg.getNode(), 1);
24215 }
24216 }
24217
24218 // Try to use the carry flag from the add in place of a separate CMP for:
24219 // (seteq (add X, -1), -1). Similar for setne.
24220 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24221 Op0.getOperand(1) == Op1) {
24222 if (isProfitableToUseFlagOp(Op0)) {
24223 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24224
24225 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24226 Op0.getOperand(1));
24227 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24228 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24229 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24230 return SDValue(New.getNode(), 1);
24231 }
24232 }
24233 }
24234
24235 X86::CondCode CondCode =
24236 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24237 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24238
24239 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24240 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24241 return EFLAGS;
24242}
24243
24244SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24245
24246 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24247 Op.getOpcode() == ISD::STRICT_FSETCCS;
24248 MVT VT = Op->getSimpleValueType(0);
24249
24250 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24251
24252 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24253 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24254 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24255 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24256 SDLoc dl(Op);
24257 ISD::CondCode CC =
24258 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24259
24260 if (isSoftF16(Op0.getValueType(), Subtarget))
24261 return SDValue();
24262
24263 // Handle f128 first, since one possible outcome is a normal integer
24264 // comparison which gets handled by emitFlagsForSetcc.
24265 if (Op0.getValueType() == MVT::f128) {
24266 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24267 Op.getOpcode() == ISD::STRICT_FSETCCS);
24268
24269 // If softenSetCCOperands returned a scalar, use it.
24270 if (!Op1.getNode()) {
24271 assert(Op0.getValueType() == Op.getValueType() &&
24272 "Unexpected setcc expansion!");
24273 if (IsStrict)
24274 return DAG.getMergeValues({Op0, Chain}, dl);
24275 return Op0;
24276 }
24277 }
24278
24279 if (Op0.getSimpleValueType().isInteger()) {
24280 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
24281 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
24282 // this may translate to fewer uops depending on the uarch implementation. The
24283 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24284 // canonicalize to that CondCode.
24285 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24286 // encoding size - so it must either already be an i8 or i32 immediate, or it
24287 // shrinks down to that. We don't do this for any i64's to avoid additional
24288 // constant materializations.
24289 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24290 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24291 const APInt &Op1Val = Op1C->getAPIntValue();
24292 if (!Op1Val.isZero()) {
24293 // Ensure the constant+1 doesn't overflow.
24294 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24295 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24296 APInt Op1ValPlusOne = Op1Val + 1;
24297 if (Op1ValPlusOne.isSignedIntN(32) &&
24298 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24299 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24300 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24301 : ISD::CondCode::SETUGE;
24302 }
24303 }
24304 }
24305 }
24306
24307 SDValue X86CC;
24308 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24309 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24310 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24311 }
24312
24313 if (Subtarget.hasAVX10_2()) {
24314 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24315 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24316 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24317 if (Op0.getSimpleValueType() != MVT::f80)
24318 return getSETCC(
24319 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24320 }
24321 }
24322 // Handle floating point.
24323 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24324 if (CondCode == X86::COND_INVALID)
24325 return SDValue();
24326
24327 SDValue EFLAGS;
24328 if (IsStrict) {
24329 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24330 EFLAGS =
24331 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24332 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24333 Chain = EFLAGS.getValue(1);
24334 } else {
24335 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24336 }
24337
24338 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24339 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24340 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24341}
24342
24343SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24344 SDValue LHS = Op.getOperand(0);
24345 SDValue RHS = Op.getOperand(1);
24346 SDValue Carry = Op.getOperand(2);
24347 SDValue Cond = Op.getOperand(3);
24348 SDLoc DL(Op);
24349
24350 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24351 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24352
24353 // Recreate the carry if needed.
24354 EVT CarryVT = Carry.getValueType();
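// Adding all-ones (-1) to the 0/1 carry value produces a carry-out exactly
// when the incoming carry was 1, recreating it in EFLAGS for the SBB below.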
24355 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24356 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24357
24358 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24359 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24360 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24361}
24362
24363// This function returns three things: the arithmetic computation itself
24364// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24365// flag and the condition code define the case in which the arithmetic
24366// computation overflows.
24367static std::pair<SDValue, SDValue>
24368 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24369 assert(Op.getResNo() == 0 && "Unexpected result number!");
24370 SDValue Value, Overflow;
24371 SDValue LHS = Op.getOperand(0);
24372 SDValue RHS = Op.getOperand(1);
24373 unsigned BaseOp = 0;
24374 SDLoc DL(Op);
24375 switch (Op.getOpcode()) {
24376 default: llvm_unreachable("Unknown ovf instruction!");
24377 case ISD::SADDO:
24378 BaseOp = X86ISD::ADD;
24379 Cond = X86::COND_O;
24380 break;
24381 case ISD::UADDO:
24382 BaseOp = X86ISD::ADD;
24383 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24384 break;
24385 case ISD::SSUBO:
24386 BaseOp = X86ISD::SUB;
24387 Cond = X86::COND_O;
24388 break;
24389 case ISD::USUBO:
24390 BaseOp = X86ISD::SUB;
24391 Cond = X86::COND_B;
24392 break;
24393 case ISD::SMULO:
24394 BaseOp = X86ISD::SMUL;
24395 Cond = X86::COND_O;
24396 break;
24397 case ISD::UMULO:
24398 BaseOp = X86ISD::UMUL;
24399 Cond = X86::COND_O;
24400 break;
24401 }
24402
24403 if (BaseOp) {
24404 // Also sets EFLAGS.
24405 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24406 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24407 Overflow = Value.getValue(1);
24408 }
24409
24410 return std::make_pair(Value, Overflow);
24411}
24412
24413 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24414 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24415 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24416 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24417 // has only one use.
24418 SDLoc DL(Op);
24419 X86::CondCode Cond;
24420 SDValue Value, Overflow;
24421 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24422
24423 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24424 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24425 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24426}
24427
24428 /// Return true if opcode is an X86 logical comparison.
24429 static bool isX86LogicalCmp(SDValue Op) {
24430 unsigned Opc = Op.getOpcode();
24431 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24432 Opc == X86ISD::FCMP)
24433 return true;
24434 if (Op.getResNo() == 1 &&
24435 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24436 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24437 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24438 return true;
24439
24440 return false;
24441}
24442
24443 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24444 if (V.getOpcode() != ISD::TRUNCATE)
24445 return false;
24446
24447 SDValue VOp0 = V.getOperand(0);
24448 unsigned InBits = VOp0.getValueSizeInBits();
24449 unsigned Bits = V.getValueSizeInBits();
24450 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24451}
24452
24453// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24454 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24455 unsigned X86CC, const SDLoc &DL,
24456 SelectionDAG &DAG,
24457 const X86Subtarget &Subtarget) {
24458 EVT CmpVT = CmpVal.getValueType();
24459 EVT VT = LHS.getValueType();
24460 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24461 return SDValue();
24462
24463 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24464 isOneConstant(CmpVal.getOperand(1))) {
24465 auto SplatLSB = [&](EVT SplatVT) {
24466 // We need a mask of all zeros or all ones with the same size as the other
24467 // operands.
24468 SDValue Neg = CmpVal;
24469 if (CmpVT.bitsGT(SplatVT))
24470 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24471 else if (CmpVT.bitsLT(SplatVT))
24472 Neg = DAG.getNode(
24473 ISD::AND, DL, SplatVT,
24474 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24475 DAG.getConstant(1, DL, SplatVT));
24476 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24477 };
24478
24479 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24480 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24481 return SplatLSB(VT);
24482
24483 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24484 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24485 isa<ConstantSDNode>(RHS)) {
24486 SDValue Mask = SplatLSB(VT);
24487 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24488 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24489 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24490 }
24491
24492 SDValue Src1, Src2;
24493 auto isIdentityPatternZero = [&]() {
24494 switch (RHS.getOpcode()) {
24495 default:
24496 break;
24497 case ISD::OR:
24498 case ISD::XOR:
24499 case ISD::ADD:
24500 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24501 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24502 Src2 = LHS;
24503 return true;
24504 }
24505 break;
24506 case ISD::SHL:
24507 case ISD::SRA:
24508 case ISD::SRL:
24509 case ISD::SUB:
24510 if (RHS.getOperand(0) == LHS) {
24511 Src1 = RHS.getOperand(1);
24512 Src2 = LHS;
24513 return true;
24514 }
24515 break;
24516 }
24517 return false;
24518 };
24519
24520 auto isIdentityPatternOnes = [&]() {
24521 switch (LHS.getOpcode()) {
24522 default:
24523 break;
24524 case ISD::AND:
24525 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24526 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24527 Src2 = RHS;
24528 return true;
24529 }
24530 break;
24531 }
24532 return false;
24533 };
24534
24535 // Convert 'identity' patterns (iff X is 0 or 1):
24536 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24537 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24538 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24539 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24540 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24541 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24542 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24543 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24544 SDValue Mask = SplatLSB(Src1.getValueType());
24545 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24546 Src1); // Mask & z
24547 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24548 }
24549 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24550 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24551 SDValue Mask = SplatLSB(VT);
24552 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24553 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24554 }
24555 }
24556
24557 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24558 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24559 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24560 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24561
24562 // 'X - 1' sets the carry flag if X == 0.
24563 // '0 - X' sets the carry flag if X != 0.
24564 // Convert the carry flag to a -1/0 mask with sbb:
24565 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24566 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24567 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24568 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24569 SDValue Sub;
24570 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24571 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24572 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24573 } else {
24574 SDValue One = DAG.getConstant(1, DL, CmpVT);
24575 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24576 }
24577 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24578 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24579 Sub.getValue(1));
24580 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24581 }
24582
24583 return SDValue();
24584}
24585
24586SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24587 bool AddTest = true;
24588 SDValue Cond = Op.getOperand(0);
24589 SDValue Op1 = Op.getOperand(1);
24590 SDValue Op2 = Op.getOperand(2);
24591 SDLoc DL(Op);
24592 MVT VT = Op1.getSimpleValueType();
24593 SDValue CC;
24594
24595 if (isSoftF16(VT, Subtarget)) {
24596 MVT NVT = VT.changeTypeToInteger();
24597 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24598 DAG.getBitcast(NVT, Op1),
24599 DAG.getBitcast(NVT, Op2)));
24600 }
24601
24602 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24603  // are available, or into a VBLENDV if AVX is available.
24604 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
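  // For example, (select (setolt a, b), x, y) with plain SSE becomes a
  // cmpltss/cmpltsd that builds an all-ones/zero mask, followed by
  // FAND/FANDN/FOR to pick between x and y; with AVX the same mask instead
  // feeds a single variable blend (VBLENDV).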
24605 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24606 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24607 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24608 bool IsAlwaysSignaling;
24609 unsigned SSECC =
24610 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24611 CondOp0, CondOp1, IsAlwaysSignaling);
24612
24613 if (Subtarget.hasAVX512()) {
24614 SDValue Cmp =
24615 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24616 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24617 assert(!VT.isVector() && "Not a scalar type?");
24618 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24619 }
24620
24621 if (SSECC < 8 || Subtarget.hasAVX()) {
24622 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24623 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24624
24625 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24626 // of 3 logic instructions for size savings and potentially speed.
24627 // Unfortunately, there is no scalar form of VBLENDV.
24628
24629 // If either operand is a +0.0 constant, don't try this. We can expect to
24630 // optimize away at least one of the logic instructions later in that
24631 // case, so that sequence would be faster than a variable blend.
24632
24633 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24634 // uses XMM0 as the selection register. That may need just as many
24635 // instructions as the AND/ANDN/OR sequence due to register moves, so
24636 // don't bother.
24637 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24638 !isNullFPConstant(Op2)) {
24639 // Convert to vectors, do a VSELECT, and convert back to scalar.
24640 // All of the conversions should be optimized away.
24641 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24642 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24643 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24644 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24645
24646 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24647 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24648
24649 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24650
24651 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
24652 DAG.getVectorIdxConstant(0, DL));
24653 }
24654 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24655 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24656 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24657 }
24658 }
24659
24660 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24661 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24662 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24663 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24664 }
24665
24666 if (Cond.getOpcode() == ISD::SETCC &&
24667 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
24668 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24669 Cond = NewCond;
24670 // If the condition was updated, it's possible that the operands of the
24671 // select were also updated (for example, EmitTest has a RAUW). Refresh
24672 // the local references to the select operands in case they got stale.
24673 Op1 = Op.getOperand(1);
24674 Op2 = Op.getOperand(2);
24675 }
24676 }
24677
24678 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24679 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24680 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24681 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24682 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24683 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24684 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24685 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
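  // For example, for (select (x < 0), x, 0) on i32: x >> 31 is all-ones exactly
  // when x is negative, so ANDing it with x returns x for negative inputs and
  // 0 otherwise, i.e. smin(x, 0) without a branch or a cmov.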
24686 if (Cond.getOpcode() == X86ISD::SETCC &&
24687 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24688 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24689 SDValue Cmp = Cond.getOperand(1);
24690 SDValue CmpOp0 = Cmp.getOperand(0);
24691 unsigned CondCode = Cond.getConstantOperandVal(0);
24692
24693 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24694 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24695    // handling to keep the CMP with 0. This should be removed by
24696 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24697 // cttz_zero_undef.
24698 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24699 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24700 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24701 };
24702 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24703 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24704 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24705 // Keep Cmp.
24706 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
24707 DL, DAG, Subtarget)) {
24708 return R;
24709 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24710 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24711 ((CondCode == X86::COND_S) || // smin(x, 0)
24712 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24713 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24714 //
24715 // If the comparison is testing for a positive value, we have to invert
24716 // the sign bit mask, so only do that transform if the target has a
24717 // bitwise 'and not' instruction (the invert is free).
24718 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24719 unsigned ShCt = VT.getSizeInBits() - 1;
24720 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24721 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24722 if (CondCode == X86::COND_G)
24723 Shift = DAG.getNOT(DL, Shift, VT);
24724 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24725 }
24726 }
24727
24728 // Look past (and (setcc_carry (cmp ...)), 1).
24729 if (Cond.getOpcode() == ISD::AND &&
24730 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24731 isOneConstant(Cond.getOperand(1)))
24732 Cond = Cond.getOperand(0);
24733
24734 // Attempt to fold "raw cond" cases by treating them as:
24735  // (select (and X, 1), Op1, Op2) --> (select (icmpeq (and X, 1), 0), Op2, Op1)
24736 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
24737 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
24738 Subtarget))
24739 return R;
24740
24741 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24742 // setting operand in place of the X86ISD::SETCC.
24743 unsigned CondOpcode = Cond.getOpcode();
24744 if (CondOpcode == X86ISD::SETCC ||
24745 CondOpcode == X86ISD::SETCC_CARRY) {
24746 CC = Cond.getOperand(0);
24747
24748 SDValue Cmp = Cond.getOperand(1);
24749 bool IllegalFPCMov = false;
24750 if (VT.isFloatingPoint() && !VT.isVector() &&
24751 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24752 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24753
24754 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24755 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24756 Cond = Cmp;
24757 AddTest = false;
24758 }
24759 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24760 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24761 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24762 SDValue Value;
24763 X86::CondCode X86Cond;
24764 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24765
24766 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24767 AddTest = false;
24768 }
24769
24770 if (AddTest) {
24771 // Look past the truncate if the high bits are known zero.
24772    if (isTruncWithZeroHighBitsInput(Cond, DAG))
24773 Cond = Cond.getOperand(0);
24774
24775 // We know the result of AND is compared against zero. Try to match
24776 // it to BT.
24777 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24778 X86::CondCode X86CondCode;
24779 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24780 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24781 Cond = BT;
24782 AddTest = false;
24783 }
24784 }
24785 }
24786
24787 if (AddTest) {
24788 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24789 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24790 }
24791
24792 // a < b ? -1 : 0 -> RES = ~setcc_carry
24793 // a < b ? 0 : -1 -> RES = setcc_carry
24794 // a >= b ? -1 : 0 -> RES = setcc_carry
24795 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24796 if (Cond.getOpcode() == X86ISD::SUB) {
24797 unsigned CondCode = CC->getAsZExtVal();
24798
24799 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24800 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24801 (isNullConstant(Op1) || isNullConstant(Op2))) {
24802 SDValue Res =
24803 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24804 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24805 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24806 return DAG.getNOT(DL, Res, Res.getValueType());
24807 return Res;
24808 }
24809 }
24810
24811  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
24812 // widen the cmov and push the truncate through. This avoids introducing a new
24813 // branch during isel and doesn't add any extensions.
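  // For example, (i8 (select c, (trunc (i32 a)), (trunc (i32 b)))) becomes a
  // CMOV on the wider i32 values a and b followed by a single truncate to i8,
  // so the select stays branch-free and no extension nodes are created.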
24814 if (Op.getValueType() == MVT::i8 &&
24815 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24816 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24817 if (T1.getValueType() == T2.getValueType() &&
24818 // Exclude CopyFromReg to avoid partial register stalls.
24819        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode() != ISD::CopyFromReg) {
24820 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24821 CC, Cond);
24822 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24823 }
24824 }
24825
24826 // Or finally, promote i8 cmovs if we have CMOV,
24827 // or i16 cmovs if it won't prevent folding a load.
24828 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24829  // legal, but EmitLoweredSelect() cannot deal with these extensions
24830  // being inserted between two CMOVs (the same applies to the i16 case).
24831 // https://bugs.llvm.org/show_bug.cgi?id=40974
24832 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24833 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24834 !X86::mayFoldLoad(Op2, Subtarget))) {
24835 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24836 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24837 SDValue Ops[] = { Op2, Op1, CC, Cond };
24838 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24839 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24840 }
24841
24842 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24843 // condition is true.
24844 SDValue Ops[] = { Op2, Op1, CC, Cond };
24845 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24846}
24847
24848static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
24849 const X86Subtarget &Subtarget,
24850 SelectionDAG &DAG) {
24851 MVT VT = Op->getSimpleValueType(0);
24852 SDValue In = Op->getOperand(0);
24853 MVT InVT = In.getSimpleValueType();
24854 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24855 MVT VTElt = VT.getVectorElementType();
24856 unsigned NumElts = VT.getVectorNumElements();
24857
24858 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24859 MVT ExtVT = VT;
24860 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24861 // If v16i32 is to be avoided, we'll need to split and concatenate.
24862 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24863 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24864
24865 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24866 }
24867
24868 // Widen to 512-bits if VLX is not supported.
24869 MVT WideVT = ExtVT;
24870 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24871 NumElts *= 512 / ExtVT.getSizeInBits();
24872 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24873 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
24874 DAG.getVectorIdxConstant(0, dl));
24875 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24876 }
24877
24878 SDValue V;
24879 MVT WideEltVT = WideVT.getVectorElementType();
24880 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24881 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24882 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24883 } else {
24884 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
24885 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24886 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24887 }
24888
24889 // Truncate if we had to extend i16/i8 above.
24890 if (VT != ExtVT) {
24891 WideVT = MVT::getVectorVT(VTElt, NumElts);
24892 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24893 }
24894
24895 // Extract back to 128/256-bit if we widened.
24896 if (WideVT != VT)
24897 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24898 DAG.getVectorIdxConstant(0, dl));
24899
24900 return V;
24901}
24902
24903static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24904 SelectionDAG &DAG) {
24905 SDValue In = Op->getOperand(0);
24906 MVT InVT = In.getSimpleValueType();
24907 SDLoc DL(Op);
24908
24909 if (InVT.getVectorElementType() == MVT::i1)
24910 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
24911
24912 assert(Subtarget.hasAVX() && "Expected AVX support");
24913 return LowerAVXExtend(Op, DL, DAG, Subtarget);
24914}
24915
24916// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24917// For sign extend this needs to handle all vector sizes and SSE4.1 and
24918// non-SSE4.1 targets. For zero extend this should only handle inputs of
24919// MVT::v64i8 when BWI is not supported, but AVX512 is.
24920static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24921 const X86Subtarget &Subtarget,
24922 SelectionDAG &DAG) {
24923 SDValue In = Op->getOperand(0);
24924 MVT VT = Op->getSimpleValueType(0);
24925 MVT InVT = In.getSimpleValueType();
24926
24927 MVT SVT = VT.getVectorElementType();
24928 MVT InSVT = InVT.getVectorElementType();
24930
24931 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24932 return SDValue();
24933 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24934 return SDValue();
24935 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24936 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24937 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24938 return SDValue();
24939
24940 SDLoc dl(Op);
24941 unsigned Opc = Op.getOpcode();
24942 unsigned NumElts = VT.getVectorNumElements();
24943
24944 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24945 // For 512-bit vectors, we need 128-bits or 256-bits.
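  // For example, for a v16i16 -> v8i32 extension only the low 8 source
  // elements feed the result, so the low 128 bits are extracted and the full
  // 256-bit input is never needed.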
24946 if (InVT.getSizeInBits() > 128) {
24947 // Input needs to be at least the same number of elements as output, and
24948 // at least 128-bits.
24949 int InSize = InSVT.getSizeInBits() * NumElts;
24950 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24951 InVT = In.getSimpleValueType();
24952 }
24953
24954 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24955  // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24956 // need to be handled here for 256/512-bit results.
24957 if (Subtarget.hasInt256()) {
24958 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24959
24960 if (InVT.getVectorNumElements() != NumElts)
24961 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24962
24963 // FIXME: Apparently we create inreg operations that could be regular
24964 // extends.
24965 unsigned ExtOpc =
24966        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24967                                             : ISD::ZERO_EXTEND;
24968 return DAG.getNode(ExtOpc, dl, VT, In);
24969 }
24970
24971 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24972 if (Subtarget.hasAVX()) {
24973 assert(VT.is256BitVector() && "256-bit vector expected");
24974 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24975 int HalfNumElts = HalfVT.getVectorNumElements();
24976
24977 unsigned NumSrcElts = InVT.getVectorNumElements();
24978 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24979 for (int i = 0; i != HalfNumElts; ++i)
24980 HiMask[i] = HalfNumElts + i;
24981
24982 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24983 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24984 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24985 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24986 }
24987
24988 // We should only get here for sign extend.
24989 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24990 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24991 unsigned InNumElts = InVT.getVectorNumElements();
24992
24993 // If the source elements are already all-signbits, we don't need to extend,
24994 // just splat the elements.
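  // For example, a v8i16 -> v4i32 extension of a setcc result (every element
  // 0 or -1) only needs the shuffle <0,0,1,1,2,2,3,3> and a bitcast, since
  // duplicating each element reproduces the sign extension exactly.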
24995 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24996 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24997 unsigned Scale = InNumElts / NumElts;
24998 SmallVector<int, 16> ShuffleMask;
24999 for (unsigned I = 0; I != NumElts; ++I)
25000 ShuffleMask.append(Scale, I);
25001 return DAG.getBitcast(VT,
25002 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25003 }
25004
25005 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
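  // For example, v16i8 -> v8i16: shuffle each source byte into the high byte
  // of its 16-bit lane, then arithmetic-shift every lane right by 8 so the
  // byte's sign bit fills the upper half.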
25006 SDValue Curr = In;
25007 SDValue SignExt = Curr;
25008
25009 // As SRAI is only available on i16/i32 types, we expand only up to i32
25010 // and handle i64 separately.
25011 if (InVT != MVT::v4i32) {
25012 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25013
25014 unsigned DestWidth = DestVT.getScalarSizeInBits();
25015 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25016 unsigned DestElts = DestVT.getVectorNumElements();
25017
25018 // Build a shuffle mask that takes each input element and places it in the
25019 // MSBs of the new element size.
25020 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25021 for (unsigned i = 0; i != DestElts; ++i)
25022 Mask[i * Scale + (Scale - 1)] = i;
25023
25024 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25025 Curr = DAG.getBitcast(DestVT, Curr);
25026
25027 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25028 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25029 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25030 }
25031
25032 if (VT == MVT::v2i64) {
25033 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25034 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25035 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25036 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25037 SignExt = DAG.getBitcast(VT, SignExt);
25038 }
25039
25040 return SignExt;
25041}
25042
25043static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25044 SelectionDAG &DAG) {
25045 MVT VT = Op->getSimpleValueType(0);
25046 SDValue In = Op->getOperand(0);
25047 MVT InVT = In.getSimpleValueType();
25048 SDLoc dl(Op);
25049
25050 if (InVT.getVectorElementType() == MVT::i1)
25051 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25052
25053 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25054  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25055 "Expected same number of elements");
25056 assert((VT.getVectorElementType() == MVT::i16 ||
25057 VT.getVectorElementType() == MVT::i32 ||
25058 VT.getVectorElementType() == MVT::i64) &&
25059 "Unexpected element type");
25060 assert((InVT.getVectorElementType() == MVT::i8 ||
25061 InVT.getVectorElementType() == MVT::i16 ||
25062 InVT.getVectorElementType() == MVT::i32) &&
25063 "Unexpected element type");
25064
25065 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25066 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25067 return splitVectorIntUnary(Op, DAG, dl);
25068 }
25069
25070 if (Subtarget.hasInt256())
25071 return Op;
25072
25073 // Optimize vectors in AVX mode
25074 // Sign extend v8i16 to v8i32 and
25075 // v4i32 to v4i64
25076 //
25077 // Divide input vector into two parts
25078 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25079 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25080 // concat the vectors to original VT
25081 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25082 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25083
25084 unsigned NumElems = InVT.getVectorNumElements();
25085 SmallVector<int,8> ShufMask(NumElems, -1);
25086 for (unsigned i = 0; i != NumElems/2; ++i)
25087 ShufMask[i] = i + NumElems/2;
25088
25089 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25090 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25091
25092 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25093}
25094
25095/// Change a vector store into a pair of half-size vector stores.
25096static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25097 SDValue StoredVal = Store->getValue();
25098 assert((StoredVal.getValueType().is256BitVector() ||
25099 StoredVal.getValueType().is512BitVector()) &&
25100 "Expecting 256/512-bit op");
25101
25102 // Splitting volatile memory ops is not allowed unless the operation was not
25103 // legal to begin with. Assume the input store is legal (this transform is
25104 // only used for targets with AVX). Note: It is possible that we have an
25105 // illegal type like v2i128, and so we could allow splitting a volatile store
25106 // in that case if that is important.
25107 if (!Store->isSimple())
25108 return SDValue();
25109
25110 SDLoc DL(Store);
25111 SDValue Value0, Value1;
25112 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25113 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25114 SDValue Ptr0 = Store->getBasePtr();
25115 SDValue Ptr1 =
25116 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25117 SDValue Ch0 =
25118 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25119 Store->getOriginalAlign(),
25120 Store->getMemOperand()->getFlags());
25121 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25122 Store->getPointerInfo().getWithOffset(HalfOffset),
25123 Store->getOriginalAlign(),
25124 Store->getMemOperand()->getFlags());
25125 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25126}
25127
25128/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25129/// type.
25130static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25131 SelectionDAG &DAG) {
25132 SDValue StoredVal = Store->getValue();
25133 assert(StoreVT.is128BitVector() &&
25134 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25135 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25136
25137 // Splitting volatile memory ops is not allowed unless the operation was not
25138 // legal to begin with. We are assuming the input op is legal (this transform
25139 // is only used for targets with AVX).
25140 if (!Store->isSimple())
25141 return SDValue();
25142
25143 MVT StoreSVT = StoreVT.getScalarType();
25144 unsigned NumElems = StoreVT.getVectorNumElements();
25145 unsigned ScalarSize = StoreSVT.getStoreSize();
25146
25147 SDLoc DL(Store);
25148  SmallVector<SDValue, 4> Stores;
25149 for (unsigned i = 0; i != NumElems; ++i) {
25150 unsigned Offset = i * ScalarSize;
25151 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25152                                           TypeSize::getFixed(Offset), DL);
25153 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25154 DAG.getVectorIdxConstant(i, DL));
25155 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25156 Store->getPointerInfo().getWithOffset(Offset),
25157 Store->getOriginalAlign(),
25158 Store->getMemOperand()->getFlags());
25159 Stores.push_back(Ch);
25160 }
25161 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25162}
25163
25164static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25165 SelectionDAG &DAG) {
25166 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25167 SDLoc dl(St);
25168 SDValue StoredVal = St->getValue();
25169
25170 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
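  // For example, a v4i1 store is widened to v16i1, bitcast to i16, truncated
  // to i8, and the bits above the original four are masked to zero before a
  // single byte store is emitted.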
25171 if (StoredVal.getValueType().isVector() &&
25172 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25173 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25174 assert(NumElts <= 8 && "Unexpected VT");
25175 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25176 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25177 "Expected AVX512F without AVX512DQI");
25178
25179 // We must pad with zeros to ensure we store zeroes to any unused bits.
25180 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25181 DAG.getUNDEF(MVT::v16i1), StoredVal,
25182 DAG.getVectorIdxConstant(0, dl));
25183 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25184 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25185 // Make sure we store zeros in the extra bits.
25186 if (NumElts < 8)
25187 StoredVal = DAG.getZeroExtendInReg(
25188 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25189
25190 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25191 St->getPointerInfo(), St->getOriginalAlign(),
25192 St->getMemOperand()->getFlags());
25193 }
25194
25195 if (St->isTruncatingStore())
25196 return SDValue();
25197
25198 // If this is a 256-bit store of concatenated ops, we are better off splitting
25199 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
25200 // and each half can execute independently. Some cores would split the op into
25201 // halves anyway, so the concat (vinsertf128) is purely an extra op.
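  // For example, a v8f32 store whose value is a concat of two v4f32 halves is
  // emitted as two 16-byte stores at offsets 0 and 16 joined by a TokenFactor,
  // instead of first materializing the 256-bit value with vinsertf128.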
25202 MVT StoreVT = StoredVal.getSimpleValueType();
25203 if (StoreVT.is256BitVector() ||
25204 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
25205 !Subtarget.hasBWI())) {
25206 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
25207 return splitVectorStore(St, DAG);
25208 return SDValue();
25209 }
25210
25211 if (StoreVT.is32BitVector())
25212 return SDValue();
25213
25214 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25215 assert(StoreVT.is64BitVector() && "Unexpected VT");
25216 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25217             TargetLowering::TypeWidenVector &&
25218 "Unexpected type action!");
25219
25220 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25221 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25222 DAG.getUNDEF(StoreVT));
25223
25224 if (Subtarget.hasSSE2()) {
25225 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25226 // and store it.
25227 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25228 MVT CastVT = MVT::getVectorVT(StVT, 2);
25229 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25230 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25231 DAG.getVectorIdxConstant(0, dl));
25232
25233 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25234 St->getPointerInfo(), St->getOriginalAlign(),
25235 St->getMemOperand()->getFlags());
25236 }
25237 assert(Subtarget.hasSSE1() && "Expected SSE");
25238 SDVTList Tys = DAG.getVTList(MVT::Other);
25239 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25240 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25241 St->getMemOperand());
25242}
25243
25244// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25245// may emit an illegal shuffle but the expansion is still better than scalar
25246// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25247// we'll emit a shuffle and an arithmetic shift.
25248// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25249// TODO: It is possible to support ZExt by zeroing the undef values during
25250// the shuffle phase or after the shuffle.
25251static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25252 SelectionDAG &DAG) {
25253 MVT RegVT = Op.getSimpleValueType();
25254 assert(RegVT.isVector() && "We only custom lower vector loads.");
25255 assert(RegVT.isInteger() &&
25256 "We only custom lower integer vector loads.");
25257
25258 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25259 SDLoc dl(Ld);
25260
25261 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25262 if (RegVT.getVectorElementType() == MVT::i1) {
25263 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25264 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25265 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25266 "Expected AVX512F without AVX512DQI");
25267
25268 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25269 Ld->getPointerInfo(), Ld->getOriginalAlign(),
25270 Ld->getMemOperand()->getFlags());
25271
25272 // Replace chain users with the new chain.
25273 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25274
25275 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25276 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25277 DAG.getBitcast(MVT::v16i1, Val),
25278 DAG.getVectorIdxConstant(0, dl));
25279 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25280 }
25281
25282 return SDValue();
25283}
25284
25285/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25286/// each of which has no other use apart from the AND / OR.
25287static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25288 Opc = Op.getOpcode();
25289 if (Opc != ISD::OR && Opc != ISD::AND)
25290 return false;
25291 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25292 Op.getOperand(0).hasOneUse() &&
25293 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25294 Op.getOperand(1).hasOneUse());
25295}
25296
25297SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25298 SDValue Chain = Op.getOperand(0);
25299 SDValue Cond = Op.getOperand(1);
25300 SDValue Dest = Op.getOperand(2);
25301 SDLoc dl(Op);
25302
25303 // Bail out when we don't have native compare instructions.
25304 if (Cond.getOpcode() == ISD::SETCC &&
25305 Cond.getOperand(0).getValueType() != MVT::f128 &&
25306 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25307 SDValue LHS = Cond.getOperand(0);
25308 SDValue RHS = Cond.getOperand(1);
25309 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25310
25311 // Special case for
25312 // setcc([su]{add,sub,mul}o == 0)
25313 // setcc([su]{add,sub,mul}o != 1)
25314 if (ISD::isOverflowIntrOpRes(LHS) &&
25315 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25316 (isNullConstant(RHS) || isOneConstant(RHS))) {
25317 SDValue Value, Overflow;
25318 X86::CondCode X86Cond;
25319 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25320
25321 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25322 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25323
25324 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25325 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25326 Overflow, Op->getFlags());
25327 }
25328
25329 if (LHS.getSimpleValueType().isInteger()) {
25330 SDValue CCVal;
25331 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25332 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25333 EFLAGS, Op->getFlags());
25334 }
25335
25336 if (CC == ISD::SETOEQ) {
25337 // For FCMP_OEQ, we can emit
25338 // two branches instead of an explicit AND instruction with a
25339 // separate test. However, we only do this if this block doesn't
25340 // have a fall-through edge, because this requires an explicit
25341 // jmp when the condition is false.
25342 if (Op.getNode()->hasOneUse()) {
25343 SDNode *User = *Op.getNode()->user_begin();
25344 // Look for an unconditional branch following this conditional branch.
25345 // We need this because we need to reverse the successors in order
25346 // to implement FCMP_OEQ.
25347 if (User->getOpcode() == ISD::BR) {
25348 SDValue FalseBB = User->getOperand(1);
25349 SDNode *NewBR =
25350 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25351 assert(NewBR == User);
25352 (void)NewBR;
25353 Dest = FalseBB;
25354
25355 SDValue Cmp =
25356 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25357 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25358 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25359 CCVal, Cmp, Op->getFlags());
25360 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25361 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25362 Cmp, Op->getFlags());
25363 }
25364 }
25365 } else if (CC == ISD::SETUNE) {
25366 // For FCMP_UNE, we can emit
25367 // two branches instead of an explicit OR instruction with a
25368 // separate test.
25369 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25370 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25371 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25372 Cmp, Op->getFlags());
25373 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25374 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25375 Cmp, Op->getFlags());
25376 } else {
25377 X86::CondCode X86Cond =
25378 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25379 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25380 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25381 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25382 Cmp, Op->getFlags());
25383 }
25384 }
25385
25386  if (ISD::isOverflowIntrOpRes(Cond)) {
25387 SDValue Value, Overflow;
25388 X86::CondCode X86Cond;
25389 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25390
25391 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25392 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25393 Overflow, Op->getFlags());
25394 }
25395
25396 // Look past the truncate if the high bits are known zero.
25397  if (isTruncWithZeroHighBitsInput(Cond, DAG))
25398 Cond = Cond.getOperand(0);
25399
25400 EVT CondVT = Cond.getValueType();
25401
25402 // Add an AND with 1 if we don't already have one.
25403 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25404 Cond =
25405 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25406
25407 SDValue LHS = Cond;
25408 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25409
25410 SDValue CCVal;
25411 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25412 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25413 Op->getFlags());
25414}
25415
25416// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25417// Calls to _alloca are needed to probe the stack when allocating more than 4k
25418// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25419// that the guard pages used by the OS virtual memory manager are allocated in
25420// correct sequence.
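// For example, a single 16 KB allocation would otherwise jump past the guard
// page; routing it through _alloca (or an inline probe loop) touches the stack
// at every 4 KB step so the OS can commit one guard page at a time.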
25421SDValue
25422X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25423 SelectionDAG &DAG) const {
25424  MachineFunction &MF = DAG.getMachineFunction();
25425 bool SplitStack = MF.shouldSplitStack();
25426 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25427 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25428 SplitStack || EmitStackProbeCall;
25429 SDLoc dl(Op);
25430
25431 // Get the inputs.
25432 SDNode *Node = Op.getNode();
25433 SDValue Chain = Op.getOperand(0);
25434 SDValue Size = Op.getOperand(1);
25435 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25436 EVT VT = Node->getValueType(0);
25437
25438 // Chain the dynamic stack allocation so that it doesn't modify the stack
25439 // pointer when other instructions are using the stack.
25440 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25441
25442 bool Is64Bit = Subtarget.is64Bit();
25443 MVT SPTy = getPointerTy(DAG.getDataLayout());
25444
25445  SDValue Result;
25446 if (!Lower) {
25447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25448    Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25449 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25450 " not tell us which reg is the stack pointer!");
25451
25452 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25453 const Align StackAlign = TFI.getStackAlign();
25454 if (hasInlineStackProbe(MF)) {
25455 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25456 {Chain, Size});
25457 Chain = Result.getValue(1);
25458 } else {
25459 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25460 Chain = SP.getValue(1);
25461 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25462 }
25463 if (Alignment && *Alignment > StackAlign)
25464 Result = DAG.getNode(
25465 ISD::AND, dl, VT, Result,
25466 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25467 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25468 } else if (SplitStack) {
25469 if (Is64Bit) {
25470      // The 64-bit implementation of segmented stacks needs to clobber both r10
25471      // and r11. This makes it impossible to use it along with nested parameters.
25472 const Function &F = MF.getFunction();
25473 for (const auto &A : F.args()) {
25474 if (A.hasNestAttr())
25475 report_fatal_error("Cannot use segmented stacks with functions that "
25476 "have nested arguments.");
25477 }
25478 }
25479
25480 Result =
25481 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25482 Chain = Result.getValue(1);
25483 } else {
25484 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25485 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25486 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25487
25488 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25489 Register SPReg = RegInfo->getStackRegister();
25490 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25491 Chain = SP.getValue(1);
25492
25493 if (Alignment) {
25494 SP = DAG.getNode(
25495 ISD::AND, dl, VT, SP.getValue(0),
25496 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25497 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25498 }
25499
25500 Result = SP;
25501 }
25502
25503 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25504
25505 SDValue Ops[2] = {Result, Chain};
25506 return DAG.getMergeValues(Ops, dl);
25507}
25508
25509SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25510  MachineFunction &MF = DAG.getMachineFunction();
25511 auto PtrVT = getPointerTy(MF.getDataLayout());
25512  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25513
25514 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25515 SDLoc DL(Op);
25516
25517 if (!Subtarget.is64Bit() ||
25518 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25519 // vastart just stores the address of the VarArgsFrameIndex slot into the
25520 // memory location argument.
25521 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25522 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25523 MachinePointerInfo(SV));
25524 }
25525
25526 // __va_list_tag:
25527 // gp_offset (0 - 6 * 8)
25528 // fp_offset (48 - 48 + 8 * 16)
25529 // overflow_arg_area (point to parameters coming in memory).
25530 // reg_save_area
25531  SmallVector<SDValue, 8> MemOps;
25532 SDValue FIN = Op.getOperand(1);
25533 // Store gp_offset
25534 SDValue Store = DAG.getStore(
25535 Op.getOperand(0), DL,
25536 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25537 MachinePointerInfo(SV));
25538 MemOps.push_back(Store);
25539
25540 // Store fp_offset
25541 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25542 Store = DAG.getStore(
25543 Op.getOperand(0), DL,
25544 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25545 MachinePointerInfo(SV, 4));
25546 MemOps.push_back(Store);
25547
25548 // Store ptr to overflow_arg_area
25549 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25550 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25551 Store =
25552 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25553 MemOps.push_back(Store);
25554
25555 // Store ptr to reg_save_area.
25556 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25557 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25558 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25559 Store = DAG.getStore(
25560 Op.getOperand(0), DL, RSFIN, FIN,
25561 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25562 MemOps.push_back(Store);
25563 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25564}
25565
25566SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25567 assert(Subtarget.is64Bit() &&
25568 "LowerVAARG only handles 64-bit va_arg!");
25569 assert(Op.getNumOperands() == 4);
25570
25571  MachineFunction &MF = DAG.getMachineFunction();
25572 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25573 // The Win64 ABI uses char* instead of a structure.
25574 return DAG.expandVAArg(Op.getNode());
25575
25576 SDValue Chain = Op.getOperand(0);
25577 SDValue SrcPtr = Op.getOperand(1);
25578 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25579 unsigned Align = Op.getConstantOperandVal(3);
25580 SDLoc dl(Op);
25581
25582 EVT ArgVT = Op.getNode()->getValueType(0);
25583 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25584 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25585 uint8_t ArgMode;
25586
25587 // Decide which area this value should be read from.
25588 // TODO: Implement the AMD64 ABI in its entirety. This simple
25589 // selection mechanism works only for the basic types.
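  // For example, a double (FP, <= 16 bytes) is read from the XMM save area via
  // fp_offset (ArgMode == 2), while an i32/i64/pointer comes from the GPR save
  // area via gp_offset (ArgMode == 1); f80 and aggregates still need the full
  // ABI logic.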
25590 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25591 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25592 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25593 } else {
25594 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25595 "Unhandled argument type in LowerVAARG");
25596 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25597 }
25598
25599 if (ArgMode == 2) {
25600 // Make sure using fp_offset makes sense.
25601 assert(!Subtarget.useSoftFloat() &&
25602 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25603 Subtarget.hasSSE1());
25604 }
25605
25606 // Insert VAARG node into the DAG
25607 // VAARG returns two values: Variable Argument Address, Chain
25608 SDValue InstOps[] = {Chain, SrcPtr,
25609 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25610 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25611 DAG.getTargetConstant(Align, dl, MVT::i32)};
25612 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25613  SDValue VAARG = DAG.getMemIntrinsicNode(
25614      Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25615 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25616 /*Alignment=*/std::nullopt,
25617      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25618 Chain = VAARG.getValue(1);
25619
25620 // Load the next argument and return it
25621 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25622}
25623
25624static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25625 SelectionDAG &DAG) {
25626 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25627 // where a va_list is still an i8*.
25628 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25629 if (Subtarget.isCallingConvWin64(
25630          DAG.getMachineFunction().getFunction().getCallingConv()))
25631 // Probably a Win64 va_copy.
25632 return DAG.expandVACopy(Op.getNode());
25633
25634 SDValue Chain = Op.getOperand(0);
25635 SDValue DstPtr = Op.getOperand(1);
25636 SDValue SrcPtr = Op.getOperand(2);
25637 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25638 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25639 SDLoc DL(Op);
25640
25641 return DAG.getMemcpy(
25642 Chain, DL, DstPtr, SrcPtr,
25643 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25644 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25645 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
25646 MachinePointerInfo(SrcSV));
25647}
25648
25649// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25650static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25651 switch (Opc) {
25652 case ISD::SHL:
25653 case X86ISD::VSHL:
25654 case X86ISD::VSHLI:
25655 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25656 case ISD::SRL:
25657 case X86ISD::VSRL:
25658 case X86ISD::VSRLI:
25659 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25660 case ISD::SRA:
25661 case X86ISD::VSRA:
25662 case X86ISD::VSRAI:
25663 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25664 }
25665 llvm_unreachable("Unknown target vector shift node");
25666}
25667
25668/// Handle vector element shifts where the shift amount is a constant.
25669/// Takes immediate version of shift as input.
25670static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25671 SDValue SrcOp, uint64_t ShiftAmt,
25672 SelectionDAG &DAG) {
25673 MVT ElementType = VT.getVectorElementType();
25674
25675 // Bitcast the source vector to the output type, this is mainly necessary for
25676 // vXi8/vXi64 shifts.
25677 if (VT != SrcOp.getSimpleValueType())
25678 SrcOp = DAG.getBitcast(VT, SrcOp);
25679
25680 // Fold this packed shift into its first operand if ShiftAmt is 0.
25681 if (ShiftAmt == 0)
25682 return SrcOp;
25683
25684 // Check for ShiftAmt >= element width
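  // For example, (vsrli v4i32 X, 32) folds to zero, while (vsrai v4i32 X, 35)
  // is clamped to (vsrai v4i32 X, 31), since an arithmetic shift by more than
  // the element width still just replicates the sign bit.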
25685 if (ShiftAmt >= ElementType.getSizeInBits()) {
25686 if (Opc == X86ISD::VSRAI)
25687 ShiftAmt = ElementType.getSizeInBits() - 1;
25688 else
25689 return DAG.getConstant(0, dl, VT);
25690 }
25691
25692 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25693 && "Unknown target vector shift-by-constant node");
25694
25695 // Fold this packed vector shift into a build vector if SrcOp is a
25696 // vector of Constants or UNDEFs.
25697  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25698 unsigned ShiftOpc;
25699 switch (Opc) {
25700 default: llvm_unreachable("Unknown opcode!");
25701 case X86ISD::VSHLI:
25702 ShiftOpc = ISD::SHL;
25703 break;
25704 case X86ISD::VSRLI:
25705 ShiftOpc = ISD::SRL;
25706 break;
25707 case X86ISD::VSRAI:
25708 ShiftOpc = ISD::SRA;
25709 break;
25710 }
25711
25712 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
25713 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
25714 return C;
25715 }
25716
25717 return DAG.getNode(Opc, dl, VT, SrcOp,
25718 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25719}
25720
25721/// Handle vector element shifts by a splat shift amount
25722static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25723 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25724 const X86Subtarget &Subtarget,
25725 SelectionDAG &DAG) {
25726 MVT AmtVT = ShAmt.getSimpleValueType();
25727 assert(AmtVT.isVector() && "Vector shift type mismatch");
25728 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25729 "Illegal vector splat index");
25730
25731 // Move the splat element to the bottom element.
25732 if (ShAmtIdx != 0) {
25733 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25734 Mask[0] = ShAmtIdx;
25735 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25736 }
25737
25738 // Peek through any zext node if we can get back to a 128-bit source.
25739 if (AmtVT.getScalarSizeInBits() == 64 &&
25740 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25741       ShAmt.getOpcode() == ISD::ANY_EXTEND) &&
25742 ShAmt.getOperand(0).getValueType().isSimple() &&
25743 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25744 ShAmt = ShAmt.getOperand(0);
25745 AmtVT = ShAmt.getSimpleValueType();
25746 }
25747
25748 // See if we can mask off the upper elements using the existing source node.
25749 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25750 // do this for vXi64 types.
25751 bool IsMasked = false;
25752 if (AmtVT.getScalarSizeInBits() < 64) {
25753 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25754 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25755 // If the shift amount has come from a scalar, then zero-extend the scalar
25756 // before moving to the vector.
25757 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25758 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25759 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25760 AmtVT = MVT::v4i32;
25761 IsMasked = true;
25762 } else if (ShAmt.getOpcode() == ISD::AND) {
25763 // See if the shift amount is already masked (e.g. for rotation modulo),
25764 // then we can zero-extend it by setting all the other mask elements to
25765 // zero.
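      // For example, if the amount is (and v4i32 A, <31,31,31,31>), folding the
      // constant with <-1,0,0,0> gives (and A, <31,0,0,0>): lane 0 keeps its
      // modulo mask while the upper lanes become zero, which is exactly the
      // zero-extension the 64-bit shift amount requires.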
25766 SmallVector<SDValue> MaskElts(
25767 AmtVT.getVectorNumElements(),
25768 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25769 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25770 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25771 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25772 {ShAmt.getOperand(1), Mask}))) {
25773 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25774 IsMasked = true;
25775 }
25776 }
25777 }
25778
25779 // Extract if the shift amount vector is larger than 128-bits.
25780 if (AmtVT.getSizeInBits() > 128) {
25781 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25782 AmtVT = ShAmt.getSimpleValueType();
25783 }
25784
25785 // Zero-extend bottom element to v2i64 vector type, either by extension or
25786 // shuffle masking.
25787 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25788 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25789 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25790 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25791 } else if (Subtarget.hasSSE41()) {
25792 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25793 MVT::v2i64, ShAmt);
25794 } else {
25795 SDValue ByteShift = DAG.getTargetConstant(
25796 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25797 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25798 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25799 ByteShift);
25800 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25801 ByteShift);
25802 }
25803 }
25804
25805 // Change opcode to non-immediate version.
25806 Opc = getTargetVShiftUniformOpcode(Opc, true);
25807
25808 // The return type has to be a 128-bit type with the same element
25809 // type as the input type.
25810 MVT EltVT = VT.getVectorElementType();
25811 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25812
25813 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25814 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25815}
25816
25817/// Return Mask with the necessary casting or extending
25818/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25819static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25820 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25821 const SDLoc &dl) {
25822
25823 if (isAllOnesConstant(Mask))
25824 return DAG.getConstant(1, dl, MaskVT);
25825 if (X86::isZeroNode(Mask))
25826 return DAG.getConstant(0, dl, MaskVT);
25827
25828 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25829
25830 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25831 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25832 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25833    // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25834 SDValue Lo, Hi;
25835 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25836 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25837 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25838 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25839 } else {
25840 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25841 Mask.getSimpleValueType().getSizeInBits());
25842    // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
25843 // are extracted by EXTRACT_SUBVECTOR.
25844 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25845 DAG.getBitcast(BitcastVT, Mask),
25846 DAG.getVectorIdxConstant(0, dl));
25847 }
25848}
25849
25850/// Return (and \p Op, \p Mask) for compare instructions or
25851/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25852/// necessary casting or extending for \p Mask when lowering masking intrinsics
25853static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25854 SDValue PreservedSrc,
25855 const X86Subtarget &Subtarget,
25856 SelectionDAG &DAG) {
25857 MVT VT = Op.getSimpleValueType();
25858 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25859 unsigned OpcodeSelect = ISD::VSELECT;
25860 SDLoc dl(Op);
25861
25862 if (isAllOnesConstant(Mask))
25863 return Op;
25864
25865 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25866
25867 if (PreservedSrc.isUndef())
25868 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25869 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25870}
25871
25872/// Creates an SDNode for a predicated scalar operation.
25873/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25874/// The mask comes in as MVT::i8 and should be transformed
25875/// to MVT::v1i1 while lowering masking intrinsics.
25876/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25877/// "X86select" instead of "vselect". We just can't create the "vselect" node
25878/// for a scalar instruction.
25879static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25880 SDValue PreservedSrc,
25881 const X86Subtarget &Subtarget,
25882 SelectionDAG &DAG) {
25883
25884 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25885 if (MaskConst->getZExtValue() & 0x1)
25886 return Op;
25887
25888 MVT VT = Op.getSimpleValueType();
25889 SDLoc dl(Op);
25890
25891  assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25892 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25893 DAG.getBitcast(MVT::v8i1, Mask),
25894 DAG.getVectorIdxConstant(0, dl));
25895 if (Op.getOpcode() == X86ISD::FSETCCM ||
25896 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25897 Op.getOpcode() == X86ISD::VFPCLASSS)
25898 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25899
25900 if (PreservedSrc.isUndef())
25901 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25902 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25903}
25904
25905static int getSEHRegistrationNodeSize(const Function *Fn) {
25906 if (!Fn->hasPersonalityFn())
25907    report_fatal_error(
25908 "querying registration node size for function without personality");
25909 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25910 // WinEHStatePass for the full struct definition.
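  // For example, 6 * 4 == 24 bytes for the SEH registration node and
  // 4 * 4 == 16 bytes for the C++ EH node, matching the returns below.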
25911 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25912 case EHPersonality::MSVC_X86SEH: return 24;
25913 case EHPersonality::MSVC_CXX: return 16;
25914 default: break;
25915 }
25916  report_fatal_error(
25917 "can only recover FP for 32-bit MSVC EH personality functions");
25918}
25919
25920/// When the MSVC runtime transfers control to us, either to an outlined
25921/// function or when returning to a parent frame after catching an exception, we
25922/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25923/// Here's the math:
25924/// RegNodeBase = EntryEBP - RegNodeSize
25925/// ParentFP = RegNodeBase - ParentFrameOffset
25926/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25927/// subtracting the offset (negative on x86) takes us back to the parent FP.
25928static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25929 SDValue EntryEBP) {
25930  MachineFunction &MF = DAG.getMachineFunction();
25931 SDLoc dl;
25932
25933 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25934 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25935
25936 // It's possible that the parent function no longer has a personality function
25937 // if the exceptional code was optimized away, in which case we just return
25938 // the incoming EBP.
25939 if (!Fn->hasPersonalityFn())
25940 return EntryEBP;
25941
25942 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25943 // registration, or the .set_setframe offset.
25944  MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
25945      GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25946 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25947 SDValue ParentFrameOffset =
25948 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25949
25950 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25951 // prologue to RBP in the parent function.
25952 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25953 if (Subtarget.is64Bit())
25954 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25955
25956 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25957 // RegNodeBase = EntryEBP - RegNodeSize
25958 // ParentFP = RegNodeBase - ParentFrameOffset
25959 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25960 DAG.getConstant(RegNodeSize, dl, PtrVT));
25961 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25962}
25963
25964SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25965 SelectionDAG &DAG) const {
25966 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25967 auto isRoundModeCurDirection = [](SDValue Rnd) {
25968 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25969 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25970
25971 return false;
25972 };
25973 auto isRoundModeSAE = [](SDValue Rnd) {
25974 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25975 unsigned RC = C->getZExtValue();
25976 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25977 // Clear the NO_EXC bit and check remaining bits.
25978 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25979 // As a convenience we allow no other bits or explicitly
25980 // current direction.
25981 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25982 }
25983 }
25984
25985 return false;
25986 };
25987 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25988 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25989 RC = C->getZExtValue();
25990 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25991 // Clear the NO_EXC bit and check remaining bits.
25992 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25993 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25994 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25995 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25996 RC == X86::STATIC_ROUNDING::TO_ZERO;
25997 }
25998 }
25999
26000 return false;
26001 };
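// A few sample classifications through the helpers above, assuming the usual
// _MM_FROUND_* encoding (TO_NEAREST_INT=0, TO_NEG_INF=1, TO_POS_INF=2,
// TO_ZERO=3, CUR_DIRECTION=4, NO_EXC=8; treat the exact values as an
// assumption, the logic only depends on the X86::STATIC_ROUNDING names):
//   Rnd == CUR_DIRECTION         -> isRoundModeCurDirection
//   Rnd == NO_EXC                -> isRoundModeSAE (remaining bits are zero)
//   Rnd == NO_EXC | TO_NEG_INF   -> isRoundModeSAEToX with RC = TO_NEG_INF
//   Rnd == TO_NEG_INF alone      -> none match, so the handlers below return
//                                   SDValue() and decline to custom lower.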
26002
26003 SDLoc dl(Op);
26004 unsigned IntNo = Op.getConstantOperandVal(0);
26005 MVT VT = Op.getSimpleValueType();
26006 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26007
26008 // Propagate flags from original node to transformed node(s).
26009 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26010
26011 if (IntrData) {
26012 switch(IntrData->Type) {
26013 case INTR_TYPE_1OP: {
26014 // We specify 2 possible opcodes for intrinsics with rounding modes.
26015 // First, we check if the intrinsic may have non-default rounding mode,
26016 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26017 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26018 if (IntrWithRoundingModeOpcode != 0) {
26019 SDValue Rnd = Op.getOperand(2);
26020 unsigned RC = 0;
26021 if (isRoundModeSAEToX(Rnd, RC))
26022 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26023 Op.getOperand(1),
26024 DAG.getTargetConstant(RC, dl, MVT::i32));
26025 if (!isRoundModeCurDirection(Rnd))
26026 return SDValue();
26027 }
26028 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26029 Op.getOperand(1));
26030 }
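// Example of this shape (hedged - the authoritative table lives in
// X86IntrinsicsInfo.h): an intrinsic such as llvm.x86.avx512.sqrt.ps.512 is
// typically registered as INTR_TYPE_1OP with Opc0 = ISD::FSQRT and a non-zero
// rounding opcode in Opc1, so
//   @llvm.x86.avx512.sqrt.ps.512(<16 x float> %x, i32 CUR_DIRECTION)
// takes the plain Opc0 path, while a static-rounding immediate with NO_EXC set
// takes the Opc1 path with the rounding control emitted as a target constant.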
26031 case INTR_TYPE_1OP_SAE: {
26032 SDValue Sae = Op.getOperand(2);
26033
26034 unsigned Opc;
26035 if (isRoundModeCurDirection(Sae))
26036 Opc = IntrData->Opc0;
26037 else if (isRoundModeSAE(Sae))
26038 Opc = IntrData->Opc1;
26039 else
26040 return SDValue();
26041
26042 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26043 }
26044 case INTR_TYPE_2OP: {
26045 SDValue Src2 = Op.getOperand(2);
26046
26047 // We specify 2 possible opcodes for intrinsics with rounding modes.
26048 // First, we check if the intrinsic may have non-default rounding mode,
26049 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26050 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26051 if (IntrWithRoundingModeOpcode != 0) {
26052 SDValue Rnd = Op.getOperand(3);
26053 unsigned RC = 0;
26054 if (isRoundModeSAEToX(Rnd, RC))
26055 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26056 Op.getOperand(1), Src2,
26057 DAG.getTargetConstant(RC, dl, MVT::i32));
26058 if (!isRoundModeCurDirection(Rnd))
26059 return SDValue();
26060 }
26061
26062 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26063 Op.getOperand(1), Src2);
26064 }
26065 case INTR_TYPE_2OP_SAE: {
26066 SDValue Sae = Op.getOperand(3);
26067
26068 unsigned Opc;
26069 if (isRoundModeCurDirection(Sae))
26070 Opc = IntrData->Opc0;
26071 else if (isRoundModeSAE(Sae))
26072 Opc = IntrData->Opc1;
26073 else
26074 return SDValue();
26075
26076 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26077 Op.getOperand(2));
26078 }
26079 case INTR_TYPE_3OP:
26080 case INTR_TYPE_3OP_IMM8: {
26081 SDValue Src1 = Op.getOperand(1);
26082 SDValue Src2 = Op.getOperand(2);
26083 SDValue Src3 = Op.getOperand(3);
26084
26085 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26086 Src3.getValueType() != MVT::i8) {
26087 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26088 }
26089
26090 // We specify 2 possible opcodes for intrinsics with rounding modes.
26091 // First, we check if the intrinsic may have non-default rounding mode,
26092 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26093 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26094 if (IntrWithRoundingModeOpcode != 0) {
26095 SDValue Rnd = Op.getOperand(4);
26096 unsigned RC = 0;
26097 if (isRoundModeSAEToX(Rnd, RC))
26098 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26099 Src1, Src2, Src3,
26100 DAG.getTargetConstant(RC, dl, MVT::i32));
26101 if (!isRoundModeCurDirection(Rnd))
26102 return SDValue();
26103 }
26104
26105 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26106 {Src1, Src2, Src3});
26107 }
26108 case INTR_TYPE_4OP_IMM8: {
26109 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26110 SDValue Src4 = Op.getOperand(4);
26111 if (Src4.getValueType() != MVT::i8) {
26112 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26113 }
26114
26115 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26116 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26117 Src4);
26118 }
26119 case INTR_TYPE_1OP_MASK: {
26120 SDValue Src = Op.getOperand(1);
26121 SDValue PassThru = Op.getOperand(2);
26122 SDValue Mask = Op.getOperand(3);
26123 // We add rounding mode to the Node when
26124 // - RC Opcode is specified and
26125 // - RC is not "current direction".
26126 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26127 if (IntrWithRoundingModeOpcode != 0) {
26128 SDValue Rnd = Op.getOperand(4);
26129 unsigned RC = 0;
26130 if (isRoundModeSAEToX(Rnd, RC))
26131 return getVectorMaskingNode(
26132 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26133 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26134 Mask, PassThru, Subtarget, DAG);
26135 if (!isRoundModeCurDirection(Rnd))
26136 return SDValue();
26137 }
26138 return getVectorMaskingNode(
26139 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26140 Subtarget, DAG);
26141 }
26142 case INTR_TYPE_1OP_MASK_SAE: {
26143 SDValue Src = Op.getOperand(1);
26144 SDValue PassThru = Op.getOperand(2);
26145 SDValue Mask = Op.getOperand(3);
26146 SDValue Rnd = Op.getOperand(4);
26147
26148 unsigned Opc;
26149 if (isRoundModeCurDirection(Rnd))
26150 Opc = IntrData->Opc0;
26151 else if (isRoundModeSAE(Rnd))
26152 Opc = IntrData->Opc1;
26153 else
26154 return SDValue();
26155
26156 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26157 Subtarget, DAG);
26158 }
26159 case INTR_TYPE_SCALAR_MASK: {
26160 SDValue Src1 = Op.getOperand(1);
26161 SDValue Src2 = Op.getOperand(2);
26162 SDValue passThru = Op.getOperand(3);
26163 SDValue Mask = Op.getOperand(4);
26164 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26165 // There are 2 kinds of intrinsics in this group:
26166 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26167 // (2) With rounding mode and sae - 7 operands.
26168 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26169 if (Op.getNumOperands() == (5U + HasRounding)) {
26170 if (HasRounding) {
26171 SDValue Rnd = Op.getOperand(5);
26172 unsigned RC = 0;
26173 if (isRoundModeSAEToX(Rnd, RC))
26174 return getScalarMaskingNode(
26175 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26176 DAG.getTargetConstant(RC, dl, MVT::i32)),
26177 Mask, passThru, Subtarget, DAG);
26178 if (!isRoundModeCurDirection(Rnd))
26179 return SDValue();
26180 }
26181 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26182 Src2),
26183 Mask, passThru, Subtarget, DAG);
26184 }
26185
26186 assert(Op.getNumOperands() == (6U + HasRounding) &&
26187 "Unexpected intrinsic form");
26188 SDValue RoundingMode = Op.getOperand(5);
26189 unsigned Opc = IntrData->Opc0;
26190 if (HasRounding) {
26191 SDValue Sae = Op.getOperand(6);
26192 if (isRoundModeSAE(Sae))
26193 Opc = IntrWithRoundingModeOpcode;
26194 else if (!isRoundModeCurDirection(Sae))
26195 return SDValue();
26196 }
26197 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26198 Src2, RoundingMode),
26199 Mask, passThru, Subtarget, DAG);
26200 }
26201 case INTR_TYPE_SCALAR_MASK_RND: {
26202 SDValue Src1 = Op.getOperand(1);
26203 SDValue Src2 = Op.getOperand(2);
26204 SDValue passThru = Op.getOperand(3);
26205 SDValue Mask = Op.getOperand(4);
26206 SDValue Rnd = Op.getOperand(5);
26207
26208 SDValue NewOp;
26209 unsigned RC = 0;
26210 if (isRoundModeCurDirection(Rnd))
26211 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26212 else if (isRoundModeSAEToX(Rnd, RC))
26213 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26214 DAG.getTargetConstant(RC, dl, MVT::i32));
26215 else
26216 return SDValue();
26217
26218 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26219 }
26220 case INTR_TYPE_SCALAR_MASK_SAE: {
26221 SDValue Src1 = Op.getOperand(1);
26222 SDValue Src2 = Op.getOperand(2);
26223 SDValue passThru = Op.getOperand(3);
26224 SDValue Mask = Op.getOperand(4);
26225 SDValue Sae = Op.getOperand(5);
26226 unsigned Opc;
26227 if (isRoundModeCurDirection(Sae))
26228 Opc = IntrData->Opc0;
26229 else if (isRoundModeSAE(Sae))
26230 Opc = IntrData->Opc1;
26231 else
26232 return SDValue();
26233
26234 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26235 Mask, passThru, Subtarget, DAG);
26236 }
26237 case INTR_TYPE_2OP_MASK: {
26238 SDValue Src1 = Op.getOperand(1);
26239 SDValue Src2 = Op.getOperand(2);
26240 SDValue PassThru = Op.getOperand(3);
26241 SDValue Mask = Op.getOperand(4);
26242 SDValue NewOp;
26243 if (IntrData->Opc1 != 0) {
26244 SDValue Rnd = Op.getOperand(5);
26245 unsigned RC = 0;
26246 if (isRoundModeSAEToX(Rnd, RC))
26247 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26248 DAG.getTargetConstant(RC, dl, MVT::i32));
26249 else if (!isRoundModeCurDirection(Rnd))
26250 return SDValue();
26251 }
26252 if (!NewOp)
26253 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26254 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26255 }
26256 case INTR_TYPE_2OP_MASK_SAE: {
26257 SDValue Src1 = Op.getOperand(1);
26258 SDValue Src2 = Op.getOperand(2);
26259 SDValue PassThru = Op.getOperand(3);
26260 SDValue Mask = Op.getOperand(4);
26261
26262 unsigned Opc = IntrData->Opc0;
26263 if (IntrData->Opc1 != 0) {
26264 SDValue Sae = Op.getOperand(5);
26265 if (isRoundModeSAE(Sae))
26266 Opc = IntrData->Opc1;
26267 else if (!isRoundModeCurDirection(Sae))
26268 return SDValue();
26269 }
26270
26271 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26272 Mask, PassThru, Subtarget, DAG);
26273 }
26274 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26275 SDValue Src1 = Op.getOperand(1);
26276 SDValue Src2 = Op.getOperand(2);
26277 SDValue Src3 = Op.getOperand(3);
26278 SDValue PassThru = Op.getOperand(4);
26279 SDValue Mask = Op.getOperand(5);
26280 SDValue Sae = Op.getOperand(6);
26281 unsigned Opc;
26282 if (isRoundModeCurDirection(Sae))
26283 Opc = IntrData->Opc0;
26284 else if (isRoundModeSAE(Sae))
26285 Opc = IntrData->Opc1;
26286 else
26287 return SDValue();
26288
26289 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26290 Mask, PassThru, Subtarget, DAG);
26291 }
26292 case INTR_TYPE_3OP_MASK_SAE: {
26293 SDValue Src1 = Op.getOperand(1);
26294 SDValue Src2 = Op.getOperand(2);
26295 SDValue Src3 = Op.getOperand(3);
26296 SDValue PassThru = Op.getOperand(4);
26297 SDValue Mask = Op.getOperand(5);
26298
26299 unsigned Opc = IntrData->Opc0;
26300 if (IntrData->Opc1 != 0) {
26301 SDValue Sae = Op.getOperand(6);
26302 if (isRoundModeSAE(Sae))
26303 Opc = IntrData->Opc1;
26304 else if (!isRoundModeCurDirection(Sae))
26305 return SDValue();
26306 }
26307 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26308 Mask, PassThru, Subtarget, DAG);
26309 }
26310 case BLENDV: {
26311 SDValue Src1 = Op.getOperand(1);
26312 SDValue Src2 = Op.getOperand(2);
26313 SDValue Src3 = Op.getOperand(3);
26314
26315 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26316 Src3 = DAG.getBitcast(MaskVT, Src3);
26317
26318 // Reverse the operands to match VSELECT order.
26319 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26320 }
26321 case VPERM_2OP : {
26322 SDValue Src1 = Op.getOperand(1);
26323 SDValue Src2 = Op.getOperand(2);
26324
26325 // Swap Src1 and Src2 in the node creation
26326 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
26327 }
26328 case CFMA_OP_MASKZ:
26329 case CFMA_OP_MASK: {
26330 SDValue Src1 = Op.getOperand(1);
26331 SDValue Src2 = Op.getOperand(2);
26332 SDValue Src3 = Op.getOperand(3);
26333 SDValue Mask = Op.getOperand(4);
26334 MVT VT = Op.getSimpleValueType();
26335
26336 SDValue PassThru = Src3;
26337 if (IntrData->Type == CFMA_OP_MASKZ)
26338 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26339
26340 // We add rounding mode to the Node when
26341 // - RC Opcode is specified and
26342 // - RC is not "current direction".
26343 SDValue NewOp;
26344 if (IntrData->Opc1 != 0) {
26345 SDValue Rnd = Op.getOperand(5);
26346 unsigned RC = 0;
26347 if (isRoundModeSAEToX(Rnd, RC))
26348 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26349 DAG.getTargetConstant(RC, dl, MVT::i32));
26350 else if (!isRoundModeCurDirection(Rnd))
26351 return SDValue();
26352 }
26353 if (!NewOp)
26354 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26355 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26356 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26357 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26358 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26359 }
26360 case IFMA_OP:
26361 // NOTE: We need to swizzle the operands to pass the multiply operands
26362 // first.
26363 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26364 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26365 case FPCLASSS: {
26366 SDValue Src1 = Op.getOperand(1);
26367 SDValue Imm = Op.getOperand(2);
26368 SDValue Mask = Op.getOperand(3);
26369 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26370 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26371 Subtarget, DAG);
26372 // Need to fill with zeros to ensure the bitcast will produce zeroes
26373 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26374 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26375 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26376 DAG.getVectorIdxConstant(0, dl));
26377 return DAG.getBitcast(MVT::i8, Ins);
26378 }
26379
26380 case CMP_MASK_CC: {
26381 MVT MaskVT = Op.getSimpleValueType();
26382 SDValue CC = Op.getOperand(3);
26383 SDValue Mask = Op.getOperand(4);
26384 // We specify 2 possible opcodes for intrinsics with rounding modes.
26385 // First, we check if the intrinsic may have non-default rounding mode,
26386 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26387 if (IntrData->Opc1 != 0) {
26388 SDValue Sae = Op.getOperand(5);
26389 if (isRoundModeSAE(Sae))
26390 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26391 Op.getOperand(2), CC, Mask, Sae);
26392 if (!isRoundModeCurDirection(Sae))
26393 return SDValue();
26394 }
26395 // default rounding mode
26396 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26397 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26398 }
26399 case CMP_MASK_SCALAR_CC: {
26400 SDValue Src1 = Op.getOperand(1);
26401 SDValue Src2 = Op.getOperand(2);
26402 SDValue CC = Op.getOperand(3);
26403 SDValue Mask = Op.getOperand(4);
26404
26405 SDValue Cmp;
26406 if (IntrData->Opc1 != 0) {
26407 SDValue Sae = Op.getOperand(5);
26408 if (isRoundModeSAE(Sae))
26409 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26410 else if (!isRoundModeCurDirection(Sae))
26411 return SDValue();
26412 }
26413 // default rounding mode
26414 if (!Cmp.getNode())
26415 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26416
26417 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26418 Subtarget, DAG);
26419 // Need to fill with zeros to ensure the bitcast will produce zeroes
26420 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26421 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26422 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26423 DAG.getVectorIdxConstant(0, dl));
26424 return DAG.getBitcast(MVT::i8, Ins);
26425 }
26426 case COMI: { // Comparison intrinsics
26427 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26428 SDValue LHS = Op.getOperand(1);
26429 SDValue RHS = Op.getOperand(2);
26430 // Some conditions require the operands to be swapped.
26431 if (CC == ISD::SETLT || CC == ISD::SETLE)
26432 std::swap(LHS, RHS);
26433
26434 // For AVX10.2, support EQ and NE.
26435 bool HasAVX10_2_COMX =
26436 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26437
26438 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26439 // For BF type we need to fall back.
26440 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26441
26442 auto ComiOpCode = IntrData->Opc0;
26443 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26444
26445 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26446 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26447
26448 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26449
26450 SDValue SetCC;
26451 switch (CC) {
26452 case ISD::SETEQ: {
26453 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26454 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26455 break;
26456 // (ZF = 1 and PF = 0)
26457 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26458 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26459 break;
26460 }
26461 case ISD::SETNE: {
26462 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26463 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26464 break;
26465 // (ZF = 0 or PF = 1)
26466 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26467 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26468 break;
26469 }
26470 case ISD::SETGT: // (CF = 0 and ZF = 0)
26471 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26472 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26473 break;
26474 }
26475 case ISD::SETGE: // CF = 0
26476 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26477 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26478 break;
26479 default:
26480 llvm_unreachable("Unexpected illegal condition!");
26481 }
26482 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26483 }
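// Background on the parity fix-up above (not from the source): the legacy
// (U)COMIS* instructions report an unordered compare (either input NaN) as
// ZF = PF = CF = 1, so "equal" has to be ZF == 1 && PF == 0 and "not equal"
// ZF == 0 || PF == 1; e.g. _mm_comieq_ss(NaN, NaN) must return 0 even though
// ZF is set. With the AVX10.2 COMX forms ZF alone is sufficient for EQ/NE,
// which is why the PF check is skipped on that path.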
26484 case COMI_RM: { // Comparison intrinsics with Sae
26485 SDValue LHS = Op.getOperand(1);
26486 SDValue RHS = Op.getOperand(2);
26487 unsigned CondVal = Op.getConstantOperandVal(3);
26488 SDValue Sae = Op.getOperand(4);
26489
26490 SDValue FCmp;
26491 if (isRoundModeCurDirection(Sae))
26492 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26493 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26494 else if (isRoundModeSAE(Sae))
26495 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26496 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26497 else
26498 return SDValue();
26499 // Need to fill with zeros to ensure the bitcast will produce zeroes
26500 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26501 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26502 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26503 DAG.getVectorIdxConstant(0, dl));
26504 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26505 DAG.getBitcast(MVT::i16, Ins));
26506 }
26507 case VSHIFT: {
26508 SDValue SrcOp = Op.getOperand(1);
26509 SDValue ShAmt = Op.getOperand(2);
26510 assert(ShAmt.getValueType() == MVT::i32 &&
26511 "Unexpected VSHIFT amount type");
26512
26513 // Catch shift-by-constant.
26514 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26515 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26516 Op.getSimpleValueType(), SrcOp,
26517 CShAmt->getZExtValue(), DAG);
26518
26519 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26520 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26521 SrcOp, ShAmt, 0, Subtarget, DAG);
26522 }
26523 case COMPRESS_EXPAND_IN_REG: {
26524 SDValue Mask = Op.getOperand(3);
26525 SDValue DataToCompress = Op.getOperand(1);
26526 SDValue PassThru = Op.getOperand(2);
26527 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26528 return Op.getOperand(1);
26529
26530 // Avoid false dependency.
26531 if (PassThru.isUndef())
26532 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26533
26534 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26535 Mask);
26536 }
26537 case FIXUPIMM:
26538 case FIXUPIMM_MASKZ: {
26539 SDValue Src1 = Op.getOperand(1);
26540 SDValue Src2 = Op.getOperand(2);
26541 SDValue Src3 = Op.getOperand(3);
26542 SDValue Imm = Op.getOperand(4);
26543 SDValue Mask = Op.getOperand(5);
26544 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26545 ? Src1
26546 : getZeroVector(VT, Subtarget, DAG, dl);
26547
26548 unsigned Opc = IntrData->Opc0;
26549 if (IntrData->Opc1 != 0) {
26550 SDValue Sae = Op.getOperand(6);
26551 if (isRoundModeSAE(Sae))
26552 Opc = IntrData->Opc1;
26553 else if (!isRoundModeCurDirection(Sae))
26554 return SDValue();
26555 }
26556
26557 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26558
26559 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26560 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26561
26562 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26563 }
26564 case ROUNDP: {
26565 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26566 // Clear the upper bits of the rounding immediate so that the legacy
26567 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26568 uint64_t Round = Op.getConstantOperandVal(2);
26569 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26570 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26571 Op.getOperand(1), RoundingMode);
26572 }
26573 case ROUNDS: {
26574 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26575 // Clear the upper bits of the rounding immediate so that the legacy
26576 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26577 uint64_t Round = Op.getConstantOperandVal(3);
26578 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26579 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26580 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26581 }
26582 case BEXTRI: {
26583 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26584
26585 uint64_t Imm = Op.getConstantOperandVal(2);
26586 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26587 Op.getValueType());
26588 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26589 Op.getOperand(1), Control);
26590 }
26591 // ADC/SBB
26592 case ADX: {
26593 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26594 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26595
26596 SDValue Res;
26597 // If the carry in is zero, then we should just use ADD/SUB instead of
26598 // ADC/SBB.
26599 if (isNullConstant(Op.getOperand(1))) {
26600 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26601 Op.getOperand(3));
26602 } else {
26603 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26604 DAG.getAllOnesConstant(dl, MVT::i8));
26605 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26606 Op.getOperand(3), GenCF.getValue(1));
26607 }
26608 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26609 SDValue Results[] = { SetCC, Res };
26610 return DAG.getMergeValues(Results, dl);
26611 }
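// Concrete effect of the carry-in special case above (illustrative): for a
// call like
//   unsigned char c = _addcarry_u32(/*carry-in*/0, a, b, &out);
// the carry-in operand is a constant zero, so the lowering emits a plain
// X86ISD::ADD and reads CF from it, instead of first materializing CF via the
// "add all-ones" trick and then emitting ADC.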
26612 case CVTPD2PS_MASK:
26613 case CVTPD2DQ_MASK:
26614 case CVTQQ2PS_MASK:
26615 case TRUNCATE_TO_REG: {
26616 SDValue Src = Op.getOperand(1);
26617 SDValue PassThru = Op.getOperand(2);
26618 SDValue Mask = Op.getOperand(3);
26619
26620 if (isAllOnesConstant(Mask))
26621 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26622
26623 MVT SrcVT = Src.getSimpleValueType();
26624 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26625 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26626 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26627 {Src, PassThru, Mask});
26628 }
26629 case TRUNCATE2_TO_REG: {
26630 SDValue Src = Op.getOperand(1);
26631 SDValue Src2 = Op.getOperand(2);
26632 SDValue PassThru = Op.getOperand(3);
26633 SDValue Mask = Op.getOperand(4);
26634
26635 if (isAllOnesConstant(Mask))
26636 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
26637
26638 MVT Src2VT = Src2.getSimpleValueType();
26639 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
26640 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26641 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26642 {Src, Src2, PassThru, Mask});
26643 }
26644 case CVTPS2PH_MASK: {
26645 SDValue Src = Op.getOperand(1);
26646 SDValue Rnd = Op.getOperand(2);
26647 SDValue PassThru = Op.getOperand(3);
26648 SDValue Mask = Op.getOperand(4);
26649
26650 unsigned RC = 0;
26651 unsigned Opc = IntrData->Opc0;
26652 bool SAE = Src.getValueType().is512BitVector() &&
26653 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
26654 if (SAE) {
26656 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
26657 }
26658
26659 if (isAllOnesConstant(Mask))
26660 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
26661
26662 if (SAE)
26664 else
26665 Opc = IntrData->Opc1;
26666 MVT SrcVT = Src.getSimpleValueType();
26667 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26668 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26669 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
26670 }
26671 case CVTNEPS2BF16_MASK: {
26672 SDValue Src = Op.getOperand(1);
26673 SDValue PassThru = Op.getOperand(2);
26674 SDValue Mask = Op.getOperand(3);
26675
26676 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26677 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26678
26679 // Break false dependency.
26680 if (PassThru.isUndef())
26681 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26682
26683 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26684 Mask);
26685 }
26686 default:
26687 break;
26688 }
26689 }
26690
26691 switch (IntNo) {
26692 default: return SDValue(); // Don't custom lower most intrinsics.
26693
26694 // ptest and testp intrinsics. The intrinsics these come from are designed to
26695 // return an integer value, not just an instruction so lower it to the ptest
26696 // or testp pattern and a setcc for the result.
26697 case Intrinsic::x86_avx512_ktestc_b:
26698 case Intrinsic::x86_avx512_ktestc_w:
26699 case Intrinsic::x86_avx512_ktestc_d:
26700 case Intrinsic::x86_avx512_ktestc_q:
26701 case Intrinsic::x86_avx512_ktestz_b:
26702 case Intrinsic::x86_avx512_ktestz_w:
26703 case Intrinsic::x86_avx512_ktestz_d:
26704 case Intrinsic::x86_avx512_ktestz_q:
26705 case Intrinsic::x86_sse41_ptestz:
26706 case Intrinsic::x86_sse41_ptestc:
26707 case Intrinsic::x86_sse41_ptestnzc:
26708 case Intrinsic::x86_avx_ptestz_256:
26709 case Intrinsic::x86_avx_ptestc_256:
26710 case Intrinsic::x86_avx_ptestnzc_256:
26711 case Intrinsic::x86_avx_vtestz_ps:
26712 case Intrinsic::x86_avx_vtestc_ps:
26713 case Intrinsic::x86_avx_vtestnzc_ps:
26714 case Intrinsic::x86_avx_vtestz_pd:
26715 case Intrinsic::x86_avx_vtestc_pd:
26716 case Intrinsic::x86_avx_vtestnzc_pd:
26717 case Intrinsic::x86_avx_vtestz_ps_256:
26718 case Intrinsic::x86_avx_vtestc_ps_256:
26719 case Intrinsic::x86_avx_vtestnzc_ps_256:
26720 case Intrinsic::x86_avx_vtestz_pd_256:
26721 case Intrinsic::x86_avx_vtestc_pd_256:
26722 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26723 unsigned TestOpc = X86ISD::PTEST;
26724 X86::CondCode X86CC;
26725 switch (IntNo) {
26726 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26727 case Intrinsic::x86_avx512_ktestc_b:
26728 case Intrinsic::x86_avx512_ktestc_w:
26729 case Intrinsic::x86_avx512_ktestc_d:
26730 case Intrinsic::x86_avx512_ktestc_q:
26731 // CF = 1
26732 TestOpc = X86ISD::KTEST;
26733 X86CC = X86::COND_B;
26734 break;
26735 case Intrinsic::x86_avx512_ktestz_b:
26736 case Intrinsic::x86_avx512_ktestz_w:
26737 case Intrinsic::x86_avx512_ktestz_d:
26738 case Intrinsic::x86_avx512_ktestz_q:
26739 TestOpc = X86ISD::KTEST;
26740 X86CC = X86::COND_E;
26741 break;
26742 case Intrinsic::x86_avx_vtestz_ps:
26743 case Intrinsic::x86_avx_vtestz_pd:
26744 case Intrinsic::x86_avx_vtestz_ps_256:
26745 case Intrinsic::x86_avx_vtestz_pd_256:
26746 TestOpc = X86ISD::TESTP;
26747 [[fallthrough]];
26748 case Intrinsic::x86_sse41_ptestz:
26749 case Intrinsic::x86_avx_ptestz_256:
26750 // ZF = 1
26751 X86CC = X86::COND_E;
26752 break;
26753 case Intrinsic::x86_avx_vtestc_ps:
26754 case Intrinsic::x86_avx_vtestc_pd:
26755 case Intrinsic::x86_avx_vtestc_ps_256:
26756 case Intrinsic::x86_avx_vtestc_pd_256:
26757 TestOpc = X86ISD::TESTP;
26758 [[fallthrough]];
26759 case Intrinsic::x86_sse41_ptestc:
26760 case Intrinsic::x86_avx_ptestc_256:
26761 // CF = 1
26762 X86CC = X86::COND_B;
26763 break;
26764 case Intrinsic::x86_avx_vtestnzc_ps:
26765 case Intrinsic::x86_avx_vtestnzc_pd:
26766 case Intrinsic::x86_avx_vtestnzc_ps_256:
26767 case Intrinsic::x86_avx_vtestnzc_pd_256:
26768 TestOpc = X86ISD::TESTP;
26769 [[fallthrough]];
26770 case Intrinsic::x86_sse41_ptestnzc:
26771 case Intrinsic::x86_avx_ptestnzc_256:
26772 // ZF and CF = 0
26773 X86CC = X86::COND_A;
26774 break;
26775 }
26776
26777 SDValue LHS = Op.getOperand(1);
26778 SDValue RHS = Op.getOperand(2);
26779 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26780 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26781 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26782 }
26783
26784 case Intrinsic::x86_sse42_pcmpistria128:
26785 case Intrinsic::x86_sse42_pcmpestria128:
26786 case Intrinsic::x86_sse42_pcmpistric128:
26787 case Intrinsic::x86_sse42_pcmpestric128:
26788 case Intrinsic::x86_sse42_pcmpistrio128:
26789 case Intrinsic::x86_sse42_pcmpestrio128:
26790 case Intrinsic::x86_sse42_pcmpistris128:
26791 case Intrinsic::x86_sse42_pcmpestris128:
26792 case Intrinsic::x86_sse42_pcmpistriz128:
26793 case Intrinsic::x86_sse42_pcmpestriz128: {
26794 unsigned Opcode;
26795 X86::CondCode X86CC;
26796 switch (IntNo) {
26797 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26798 case Intrinsic::x86_sse42_pcmpistria128:
26799 Opcode = X86ISD::PCMPISTR;
26800 X86CC = X86::COND_A;
26801 break;
26802 case Intrinsic::x86_sse42_pcmpestria128:
26803 Opcode = X86ISD::PCMPESTR;
26804 X86CC = X86::COND_A;
26805 break;
26806 case Intrinsic::x86_sse42_pcmpistric128:
26807 Opcode = X86ISD::PCMPISTR;
26808 X86CC = X86::COND_B;
26809 break;
26810 case Intrinsic::x86_sse42_pcmpestric128:
26811 Opcode = X86ISD::PCMPESTR;
26812 X86CC = X86::COND_B;
26813 break;
26814 case Intrinsic::x86_sse42_pcmpistrio128:
26815 Opcode = X86ISD::PCMPISTR;
26816 X86CC = X86::COND_O;
26817 break;
26818 case Intrinsic::x86_sse42_pcmpestrio128:
26819 Opcode = X86ISD::PCMPESTR;
26820 X86CC = X86::COND_O;
26821 break;
26822 case Intrinsic::x86_sse42_pcmpistris128:
26823 Opcode = X86ISD::PCMPISTR;
26824 X86CC = X86::COND_S;
26825 break;
26826 case Intrinsic::x86_sse42_pcmpestris128:
26827 Opcode = X86ISD::PCMPESTR;
26828 X86CC = X86::COND_S;
26829 break;
26830 case Intrinsic::x86_sse42_pcmpistriz128:
26831 Opcode = X86ISD::PCMPISTR;
26832 X86CC = X86::COND_E;
26833 break;
26834 case Intrinsic::x86_sse42_pcmpestriz128:
26835 Opcode = X86ISD::PCMPESTR;
26836 X86CC = X86::COND_E;
26837 break;
26838 }
26839 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26840 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26841 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26842 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26843 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26844 }
26845
26846 case Intrinsic::x86_sse42_pcmpistri128:
26847 case Intrinsic::x86_sse42_pcmpestri128: {
26848 unsigned Opcode;
26849 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26850 Opcode = X86ISD::PCMPISTR;
26851 else
26852 Opcode = X86ISD::PCMPESTR;
26853
26854 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26855 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26856 return DAG.getNode(Opcode, dl, VTs, NewOps);
26857 }
26858
26859 case Intrinsic::x86_sse42_pcmpistrm128:
26860 case Intrinsic::x86_sse42_pcmpestrm128: {
26861 unsigned Opcode;
26862 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26863 Opcode = X86ISD::PCMPISTR;
26864 else
26865 Opcode = X86ISD::PCMPESTR;
26866
26867 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26868 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26869 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26870 }
26871
26872 case Intrinsic::eh_sjlj_lsda: {
26873 MachineFunction &MF = DAG.getMachineFunction();
26874 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26875 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26876 auto &Context = MF.getContext();
26877 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26878 Twine(MF.getFunctionNumber()));
26879 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26880 DAG.getMCSymbol(S, PtrVT));
26881 }
26882
26883 case Intrinsic::x86_seh_lsda: {
26884 // Compute the symbol for the LSDA. We know it'll get emitted later.
26885 MachineFunction &MF = DAG.getMachineFunction();
26886 SDValue Op1 = Op.getOperand(1);
26887 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26888 MCSymbol *LSDASym = MF.getContext().getOrCreateLSDASymbol(
26889 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26890
26891 // Generate a simple absolute symbol reference. This intrinsic is only
26892 // supported on 32-bit Windows, which isn't PIC.
26893 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26894 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26895 }
26896
26897 case Intrinsic::eh_recoverfp: {
26898 SDValue FnOp = Op.getOperand(1);
26899 SDValue IncomingFPOp = Op.getOperand(2);
26900 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26901 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26902 if (!Fn)
26903 report_fatal_error(
26904 "llvm.eh.recoverfp must take a function as the first argument");
26905 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26906 }
26907
26908 case Intrinsic::localaddress: {
26909 // Returns one of the stack, base, or frame pointer registers, depending on
26910 // which is used to reference local variables.
26911 MachineFunction &MF = DAG.getMachineFunction();
26912 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26913 unsigned Reg;
26914 if (RegInfo->hasBasePointer(MF))
26915 Reg = RegInfo->getBaseRegister();
26916 else { // Handles the SP or FP case.
26917 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26918 if (CantUseFP)
26919 Reg = RegInfo->getPtrSizedStackRegister(MF);
26920 else
26921 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26922 }
26923 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26924 }
26925 case Intrinsic::x86_avx512_vp2intersect_q_512:
26926 case Intrinsic::x86_avx512_vp2intersect_q_256:
26927 case Intrinsic::x86_avx512_vp2intersect_q_128:
26928 case Intrinsic::x86_avx512_vp2intersect_d_512:
26929 case Intrinsic::x86_avx512_vp2intersect_d_256:
26930 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26931 MVT MaskVT = Op.getSimpleValueType();
26932
26933 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26934 SDLoc DL(Op);
26935
26936 SDValue Operation =
26937 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26938 Op->getOperand(1), Op->getOperand(2));
26939
26940 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26941 MaskVT, Operation);
26942 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26943 MaskVT, Operation);
26944 return DAG.getMergeValues({Result0, Result1}, DL);
26945 }
26946 case Intrinsic::x86_mmx_pslli_w:
26947 case Intrinsic::x86_mmx_pslli_d:
26948 case Intrinsic::x86_mmx_pslli_q:
26949 case Intrinsic::x86_mmx_psrli_w:
26950 case Intrinsic::x86_mmx_psrli_d:
26951 case Intrinsic::x86_mmx_psrli_q:
26952 case Intrinsic::x86_mmx_psrai_w:
26953 case Intrinsic::x86_mmx_psrai_d: {
26954 SDLoc DL(Op);
26955 SDValue ShAmt = Op.getOperand(2);
26956 // If the argument is a constant, convert it to a target constant.
26957 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26958 // Clamp out of bounds shift amounts since they will otherwise be masked
26959 // to 8-bits which may make it no longer out of bounds.
26960 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26961 if (ShiftAmount == 0)
26962 return Op.getOperand(1);
26963
26964 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26965 Op.getOperand(0), Op.getOperand(1),
26966 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26967 }
26968
26969 unsigned NewIntrinsic;
26970 switch (IntNo) {
26971 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26972 case Intrinsic::x86_mmx_pslli_w:
26973 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26974 break;
26975 case Intrinsic::x86_mmx_pslli_d:
26976 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26977 break;
26978 case Intrinsic::x86_mmx_pslli_q:
26979 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26980 break;
26981 case Intrinsic::x86_mmx_psrli_w:
26982 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26983 break;
26984 case Intrinsic::x86_mmx_psrli_d:
26985 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26986 break;
26987 case Intrinsic::x86_mmx_psrli_q:
26988 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26989 break;
26990 case Intrinsic::x86_mmx_psrai_w:
26991 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26992 break;
26993 case Intrinsic::x86_mmx_psrai_d:
26994 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26995 break;
26996 }
26997
26998 // The vector shift intrinsics with scalars use 32-bit shift amounts but
26999 // the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an
27000 // MMX register.
27001 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27002 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27003 DAG.getTargetConstant(NewIntrinsic, DL,
27004 getPointerTy(DAG.getDataLayout())),
27005 Op.getOperand(1), ShAmt);
27006 }
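// Example of the rewrite above (illustrative): a constant count such as
// _mm_slli_pi16(v, 3) keeps the immediate-count intrinsic form, while a
// variable count is rewritten roughly as
//   llvm.x86.mmx.pslli.w(v, n)  -->  llvm.x86.mmx.psll.w(v, movd(n))
// with the 32-bit count first copied into an MMX register via MMX_MOVW2D.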
27007 case Intrinsic::thread_pointer: {
27008 if (Subtarget.isTargetELF()) {
27009 SDLoc dl(Op);
27010 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27011 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27012 Value *Ptr = Constant::getNullValue(PointerType::get(
27013 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27014 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27015 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27016 }
27017 report_fatal_error(
27018 "Target OS doesn't support __builtin_thread_pointer() yet.");
27019 }
27020 }
27021}
27022
27023static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27024 SDValue Src, SDValue Mask, SDValue Base,
27025 SDValue Index, SDValue ScaleOp, SDValue Chain,
27026 const X86Subtarget &Subtarget) {
27027 SDLoc dl(Op);
27028 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27029 // Scale must be constant.
27030 if (!C)
27031 return SDValue();
27032 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27033 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27034 TLI.getPointerTy(DAG.getDataLayout()));
27035 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27036 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27037 // If source is undef or we know it won't be used, use a zero vector
27038 // to break register dependency.
27039 // TODO: use undef instead and let BreakFalseDeps deal with it?
27040 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27041 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27042
27043 // Cast mask to an integer type.
27044 Mask = DAG.getBitcast(MaskVT, Mask);
27045
27046 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27047
27048 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27049 SDValue Res =
27050 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27051 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27052 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27053}
27054
27055static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27056 SDValue Src, SDValue Mask, SDValue Base,
27057 SDValue Index, SDValue ScaleOp, SDValue Chain,
27058 const X86Subtarget &Subtarget) {
27059 MVT VT = Op.getSimpleValueType();
27060 SDLoc dl(Op);
27061 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27062 // Scale must be constant.
27063 if (!C)
27064 return SDValue();
27065 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27066 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27067 TLI.getPointerTy(DAG.getDataLayout()));
27068 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27069 VT.getVectorNumElements());
27070 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27071
27072 // We support two versions of the gather intrinsics. One with scalar mask and
27073 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27074 if (Mask.getValueType() != MaskVT)
27075 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27076
27077 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27078 // If source is undef or we know it won't be used, use a zero vector
27079 // to break register dependency.
27080 // TODO: use undef instead and let BreakFalseDeps deal with it?
27081 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27082 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27083
27084 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27085
27086 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27087 SDValue Res =
27088 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27089 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27090 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27091}
27092
27093static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27094 SDValue Src, SDValue Mask, SDValue Base,
27095 SDValue Index, SDValue ScaleOp, SDValue Chain,
27096 const X86Subtarget &Subtarget) {
27097 SDLoc dl(Op);
27098 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27099 // Scale must be constant.
27100 if (!C)
27101 return SDValue();
27102 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27103 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27104 TLI.getPointerTy(DAG.getDataLayout()));
27105 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27106 Src.getSimpleValueType().getVectorNumElements());
27107 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27108
27109 // We support two versions of the scatter intrinsics. One with scalar mask and
27110 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27111 if (Mask.getValueType() != MaskVT)
27112 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27113
27114 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27115
27116 SDVTList VTs = DAG.getVTList(MVT::Other);
27117 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27118 SDValue Res =
27119 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27120 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27121 return Res;
27122}
27123
27124static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27125 SDValue Mask, SDValue Base, SDValue Index,
27126 SDValue ScaleOp, SDValue Chain,
27127 const X86Subtarget &Subtarget) {
27128 SDLoc dl(Op);
27129 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27130 // Scale must be constant.
27131 if (!C)
27132 return SDValue();
27133 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27134 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27135 TLI.getPointerTy(DAG.getDataLayout()));
27136 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27137 SDValue Segment = DAG.getRegister(0, MVT::i32);
27138 MVT MaskVT =
27139 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27140 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27141 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27142 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27143 return SDValue(Res, 0);
27144}
27145
27146/// Handles the lowering of builtin intrinsics with chain that return their
27147/// value into registers EDX:EAX.
27148/// If operand SrcReg is a valid register identifier, then operand 2 of N is
27149/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27150/// TargetOpcode.
27151/// Returns a Glue value which can be used to add extra copy-from-reg if the
27152/// expanded intrinsics implicitly defines extra registers (i.e. not just
27153/// EDX:EAX).
27154static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27155 SelectionDAG &DAG,
27156 unsigned TargetOpcode,
27157 unsigned SrcReg,
27158 const X86Subtarget &Subtarget,
27159 SmallVectorImpl<SDValue> &Results) {
27160 SDValue Chain = N->getOperand(0);
27161 SDValue Glue;
27162
27163 if (SrcReg) {
27164 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27165 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27166 Glue = Chain.getValue(1);
27167 }
27168
27169 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27170 SDValue N1Ops[] = {Chain, Glue};
27171 SDNode *N1 = DAG.getMachineNode(
27172 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27173 Chain = SDValue(N1, 0);
27174
27175 // Reads the content of XCR and returns it in registers EDX:EAX.
27176 SDValue LO, HI;
27177 if (Subtarget.is64Bit()) {
27178 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27179 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27180 LO.getValue(2));
27181 } else {
27182 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27183 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27184 LO.getValue(2));
27185 }
27186 Chain = HI.getValue(1);
27187 Glue = HI.getValue(2);
27188
27189 if (Subtarget.is64Bit()) {
27190 // Merge the two 32-bit values into a 64-bit one.
27191 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27192 DAG.getConstant(32, DL, MVT::i8));
27193 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27194 Results.push_back(Chain);
27195 return Glue;
27196 }
27197
27198 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27199 SDValue Ops[] = { LO, HI };
27200 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27201 Results.push_back(Pair);
27202 Results.push_back(Chain);
27203 return Glue;
27204}
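// Worked example of the EDX:EAX merge above (values are made up): if the
// expanded instruction leaves EDX = 0x00000001 and EAX = 0x23456789, the
// returned 64-bit value is
//   (0x00000001ULL << 32) | 0x23456789 == 0x0000000123456789
// built with SHL+OR on 64-bit targets and with ISD::BUILD_PAIR(LO, HI) on
// 32-bit targets.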
27205
27206/// Handles the lowering of builtin intrinsics that read the time stamp counter
27207/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27208/// READCYCLECOUNTER nodes.
27209static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27210 SelectionDAG &DAG,
27211 const X86Subtarget &Subtarget,
27212 SmallVectorImpl<SDValue> &Results) {
27213 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27214 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27215 // and the EAX register is loaded with the low-order 32 bits.
27216 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27217 /* NoRegister */0, Subtarget,
27218 Results);
27219 if (Opcode != X86::RDTSCP)
27220 return;
27221
27222 SDValue Chain = Results[1];
27223 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
27224 // the ECX register. Add 'ecx' explicitly to the chain.
27225 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27226 Results[1] = ecx;
27227 Results.push_back(ecx.getValue(1));
27228}
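// Source-level sketch of why the extra ECX copy exists (hedged usage example):
//   unsigned aux;
//   unsigned long long tsc = __rdtscp(&aux);  // aux receives IA32_TSC_AUX
// is typically lowered through this helper, and the third result pushed above
// is what ends up stored to 'aux'.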
27229
27230static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27231 SelectionDAG &DAG) {
27232 SmallVector<SDValue, 3> Results;
27233 SDLoc DL(Op);
27234 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27235 Results);
27236 return DAG.getMergeValues(Results, DL);
27237}
27238
27239static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27240 MachineFunction &MF = DAG.getMachineFunction();
27241 SDValue Chain = Op.getOperand(0);
27242 SDValue RegNode = Op.getOperand(2);
27243 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27244 if (!EHInfo)
27245 report_fatal_error("EH registrations only live in functions using WinEH");
27246
27247 // Cast the operand to an alloca, and remember the frame index.
27248 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27249 if (!FINode)
27250 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27251 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27252
27253 // Return the chain operand without making any DAG nodes.
27254 return Chain;
27255}
27256
27257static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27258 MachineFunction &MF = DAG.getMachineFunction();
27259 SDValue Chain = Op.getOperand(0);
27260 SDValue EHGuard = Op.getOperand(2);
27261 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27262 if (!EHInfo)
27263 report_fatal_error("EHGuard only live in functions using WinEH");
27264
27265 // Cast the operand to an alloca, and remember the frame index.
27266 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27267 if (!FINode)
27268 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27269 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27270
27271 // Return the chain operand without making any DAG nodes.
27272 return Chain;
27273}
27274
27275/// Emit Truncating Store with signed or unsigned saturation.
27276static SDValue
27277EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27278 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27279 SelectionDAG &DAG) {
27280 SDVTList VTs = DAG.getVTList(MVT::Other);
27281 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27282 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27283 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27284 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27285}
27286
27287/// Emit Masked Truncating Store with signed or unsigned saturation.
27288static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27289 const SDLoc &DL,
27290 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27291 MachineMemOperand *MMO, SelectionDAG &DAG) {
27292 SDVTList VTs = DAG.getVTList(MVT::Other);
27293 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27294 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27295 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27296}
27297
27298bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27299 const MachineFunction &MF) {
27300 if (!Subtarget.is64Bit())
27301 return false;
27302 // 64-bit targets support extended Swift async frame setup,
27303 // except for targets that use the windows 64 prologue.
27304 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27305}
27306
27307static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27308 SelectionDAG &DAG) {
27309 unsigned IntNo = Op.getConstantOperandVal(1);
27310 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27311 if (!IntrData) {
27312 switch (IntNo) {
27313
27314 case Intrinsic::swift_async_context_addr: {
27315 SDLoc dl(Op);
27316 auto &MF = DAG.getMachineFunction();
27317 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27318 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27319 MF.getFrameInfo().setFrameAddressIsTaken(true);
27320 X86FI->setHasSwiftAsyncContext(true);
27321 SDValue Chain = Op->getOperand(0);
27322 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27323 SDValue Result =
27324 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27325 DAG.getTargetConstant(8, dl, MVT::i32)),
27326 0);
27327 // Return { result, chain }.
27328 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27329 CopyRBP.getValue(1));
27330 } else {
27331 // No special extended frame, create or reuse an existing stack slot.
27332 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27333 if (!X86FI->getSwiftAsyncContextFrameIdx())
27334 X86FI->setSwiftAsyncContextFrameIdx(
27335 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27336 false));
27337 SDValue Result =
27338 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27339 PtrSize == 8 ? MVT::i64 : MVT::i32);
27340 // Return { result, chain }.
27341 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27342 Op->getOperand(0));
27343 }
27344 }
27345
27346 case llvm::Intrinsic::x86_seh_ehregnode:
27347 return MarkEHRegistrationNode(Op, DAG);
27348 case llvm::Intrinsic::x86_seh_ehguard:
27349 return MarkEHGuard(Op, DAG);
27350 case llvm::Intrinsic::x86_rdpkru: {
27351 SDLoc dl(Op);
27352 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27353 // Create a RDPKRU node and pass 0 to the ECX parameter.
27354 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27355 DAG.getConstant(0, dl, MVT::i32));
27356 }
27357 case llvm::Intrinsic::x86_wrpkru: {
27358 SDLoc dl(Op);
27359 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27360 // to the EDX and ECX parameters.
27361 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27362 Op.getOperand(0), Op.getOperand(2),
27363 DAG.getConstant(0, dl, MVT::i32),
27364 DAG.getConstant(0, dl, MVT::i32));
27365 }
27366 case llvm::Intrinsic::asan_check_memaccess: {
27367 // Mark this as adjustsStack because it will be lowered to a call.
27368 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27369 // Don't do anything here, we will expand these intrinsics out later.
27370 return Op;
27371 }
27372 case llvm::Intrinsic::x86_flags_read_u32:
27373 case llvm::Intrinsic::x86_flags_read_u64:
27374 case llvm::Intrinsic::x86_flags_write_u32:
27375 case llvm::Intrinsic::x86_flags_write_u64: {
27376 // We need a frame pointer because this will get lowered to a PUSH/POP
27377 // sequence.
27378 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27379 MFI.setHasCopyImplyingStackAdjustment(true);
27380 // Don't do anything here, we will expand these intrinsics out later
27381 // during FinalizeISel in EmitInstrWithCustomInserter.
27382 return Op;
27383 }
27384 case Intrinsic::x86_lwpins32:
27385 case Intrinsic::x86_lwpins64:
27386 case Intrinsic::x86_umwait:
27387 case Intrinsic::x86_tpause: {
27388 SDLoc dl(Op);
27389 SDValue Chain = Op->getOperand(0);
27390 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27391 unsigned Opcode;
27392
27393 switch (IntNo) {
27394 default: llvm_unreachable("Impossible intrinsic");
27395 case Intrinsic::x86_umwait:
27396 Opcode = X86ISD::UMWAIT;
27397 break;
27398 case Intrinsic::x86_tpause:
27399 Opcode = X86ISD::TPAUSE;
27400 break;
27401 case Intrinsic::x86_lwpins32:
27402 case Intrinsic::x86_lwpins64:
27403 Opcode = X86ISD::LWPINS;
27404 break;
27405 }
27406
27407 SDValue Operation =
27408 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27409 Op->getOperand(3), Op->getOperand(4));
27410 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27411 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27412 Operation.getValue(1));
27413 }
27414 case Intrinsic::x86_enqcmd:
27415 case Intrinsic::x86_enqcmds: {
27416 SDLoc dl(Op);
27417 SDValue Chain = Op.getOperand(0);
27418 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27419 unsigned Opcode;
27420 switch (IntNo) {
27421 default: llvm_unreachable("Impossible intrinsic!");
27422 case Intrinsic::x86_enqcmd:
27423 Opcode = X86ISD::ENQCMD;
27424 break;
27425 case Intrinsic::x86_enqcmds:
27426 Opcode = X86ISD::ENQCMDS;
27427 break;
27428 }
27429 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27430 Op.getOperand(3));
27431 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27432 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27433 Operation.getValue(1));
27434 }
27435 case Intrinsic::x86_aesenc128kl:
27436 case Intrinsic::x86_aesdec128kl:
27437 case Intrinsic::x86_aesenc256kl:
27438 case Intrinsic::x86_aesdec256kl: {
27439 SDLoc DL(Op);
27440 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27441 SDValue Chain = Op.getOperand(0);
27442 unsigned Opcode;
27443
27444 switch (IntNo) {
27445 default: llvm_unreachable("Impossible intrinsic");
27446 case Intrinsic::x86_aesenc128kl:
27447 Opcode = X86ISD::AESENC128KL;
27448 break;
27449 case Intrinsic::x86_aesdec128kl:
27450 Opcode = X86ISD::AESDEC128KL;
27451 break;
27452 case Intrinsic::x86_aesenc256kl:
27453 Opcode = X86ISD::AESENC256KL;
27454 break;
27455 case Intrinsic::x86_aesdec256kl:
27456 Opcode = X86ISD::AESDEC256KL;
27457 break;
27458 }
27459
27460 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27461 MachineMemOperand *MMO = MemIntr->getMemOperand();
27462 EVT MemVT = MemIntr->getMemoryVT();
27463 SDValue Operation = DAG.getMemIntrinsicNode(
27464 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27465 MMO);
27466 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27467
27468 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27469 {ZF, Operation.getValue(0), Operation.getValue(2)});
27470 }
27471 case Intrinsic::x86_aesencwide128kl:
27472 case Intrinsic::x86_aesdecwide128kl:
27473 case Intrinsic::x86_aesencwide256kl:
27474 case Intrinsic::x86_aesdecwide256kl: {
27475 SDLoc DL(Op);
27476 SDVTList VTs = DAG.getVTList(
27477 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27478 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27479 SDValue Chain = Op.getOperand(0);
27480 unsigned Opcode;
27481
27482 switch (IntNo) {
27483 default: llvm_unreachable("Impossible intrinsic");
27484 case Intrinsic::x86_aesencwide128kl:
27485 Opcode = X86ISD::AESENCWIDE128KL;
27486 break;
27487 case Intrinsic::x86_aesdecwide128kl:
27488 Opcode = X86ISD::AESDECWIDE128KL;
27489 break;
27490 case Intrinsic::x86_aesencwide256kl:
27491 Opcode = X86ISD::AESENCWIDE256KL;
27492 break;
27493 case Intrinsic::x86_aesdecwide256kl:
27494 Opcode = X86ISD::AESDECWIDE256KL;
27495 break;
27496 }
27497
27498 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27499 MachineMemOperand *MMO = MemIntr->getMemOperand();
27500 EVT MemVT = MemIntr->getMemoryVT();
27501 SDValue Operation = DAG.getMemIntrinsicNode(
27502 Opcode, DL, VTs,
27503 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27504 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27505 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27506 MemVT, MMO);
27507 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27508
27509 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27510 {ZF, Operation.getValue(1), Operation.getValue(2),
27511 Operation.getValue(3), Operation.getValue(4),
27512 Operation.getValue(5), Operation.getValue(6),
27513 Operation.getValue(7), Operation.getValue(8),
27514 Operation.getValue(9)});
27515 }
27516 case Intrinsic::x86_testui: {
27517 SDLoc dl(Op);
27518 SDValue Chain = Op.getOperand(0);
27519 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27520 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27521 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27522 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27523 Operation.getValue(1));
27524 }
27525 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27526 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27527 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27528 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27529 case Intrinsic::x86_t2rpntlvwz0_internal:
27530 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27531 case Intrinsic::x86_t2rpntlvwz1_internal:
27532 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27533 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27534     X86MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
27535     unsigned IntNo = Op.getConstantOperandVal(1);
27536 unsigned Opc = 0;
27537 switch (IntNo) {
27538 default:
27539 llvm_unreachable("Unexpected intrinsic!");
27540 case Intrinsic::x86_t2rpntlvwz0_internal:
27541 Opc = X86::PT2RPNTLVWZ0V;
27542 break;
27543 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27544 Opc = X86::PT2RPNTLVWZ0T1V;
27545 break;
27546 case Intrinsic::x86_t2rpntlvwz1_internal:
27547 Opc = X86::PT2RPNTLVWZ1V;
27548 break;
27549 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27550 Opc = X86::PT2RPNTLVWZ1T1V;
27551 break;
27552 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27553 Opc = X86::PT2RPNTLVWZ0RSV;
27554 break;
27555 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27556 Opc = X86::PT2RPNTLVWZ0RST1V;
27557 break;
27558 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27559 Opc = X86::PT2RPNTLVWZ1RSV;
27560 break;
27561 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27562 Opc = X86::PT2RPNTLVWZ1RST1V;
27563 break;
27564 }
27565
27566 SDLoc DL(Op);
27567 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27568
27569 SDValue Ops[] = {Op.getOperand(2), // Row
27570 Op.getOperand(3), // Col0
27571 Op.getOperand(4), // Col1
27572 Op.getOperand(5), // Base
27573 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27574 Op.getOperand(6), // Index
27575 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27576 DAG.getRegister(0, MVT::i16), // Segment
27577 Op.getOperand(0)}; // Chain
27578
27579 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27580 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27581 SDValue(Res, 0));
27582 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27583 SDValue(Res, 0));
27584 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27585 }
27586 case Intrinsic::x86_atomic_bts_rm:
27587 case Intrinsic::x86_atomic_btc_rm:
27588 case Intrinsic::x86_atomic_btr_rm: {
27589 SDLoc DL(Op);
27590 MVT VT = Op.getSimpleValueType();
27591 SDValue Chain = Op.getOperand(0);
27592 SDValue Op1 = Op.getOperand(2);
27593 SDValue Op2 = Op.getOperand(3);
27594 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27595 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27596                                                             : X86ISD::LBTR_RM;
27597     MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27598 SDValue Res =
27599 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27600 {Chain, Op1, Op2}, VT, MMO);
27601 Chain = Res.getValue(1);
27602 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27603 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27604 }
27605 case Intrinsic::x86_atomic_bts:
27606 case Intrinsic::x86_atomic_btc:
27607 case Intrinsic::x86_atomic_btr: {
27608 SDLoc DL(Op);
27609 MVT VT = Op.getSimpleValueType();
27610 SDValue Chain = Op.getOperand(0);
27611 SDValue Op1 = Op.getOperand(2);
27612 SDValue Op2 = Op.getOperand(3);
27613 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
27614 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
27615 : X86ISD::LBTR;
27616 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
27617 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27618 SDValue Res =
27619 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27620 {Chain, Op1, Op2, Size}, VT, MMO);
27621 Chain = Res.getValue(1);
27622 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27623 unsigned Imm = Op2->getAsZExtVal();
27624 if (Imm)
27625 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
27626 DAG.getShiftAmountConstant(Imm, VT, DL));
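      // The lowered value thus carries the original bit at bit position Imm,
      // e.g. for a constant bit index of 3 a previously-set bit yields 8.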
27627 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27628 }
27629 case Intrinsic::x86_cmpccxadd32:
27630 case Intrinsic::x86_cmpccxadd64: {
27631 SDLoc DL(Op);
27632 SDValue Chain = Op.getOperand(0);
27633 SDValue Addr = Op.getOperand(2);
27634 SDValue Src1 = Op.getOperand(3);
27635 SDValue Src2 = Op.getOperand(4);
27636 SDValue CC = Op.getOperand(5);
27637 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27638     SDValue Operation = DAG.getMemIntrinsicNode(
27639         X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27640 MVT::i32, MMO);
27641 return Operation;
27642 }
27643 case Intrinsic::x86_aadd32:
27644 case Intrinsic::x86_aadd64:
27645 case Intrinsic::x86_aand32:
27646 case Intrinsic::x86_aand64:
27647 case Intrinsic::x86_aor32:
27648 case Intrinsic::x86_aor64:
27649 case Intrinsic::x86_axor32:
27650 case Intrinsic::x86_axor64: {
27651 SDLoc DL(Op);
27652 SDValue Chain = Op.getOperand(0);
27653 SDValue Op1 = Op.getOperand(2);
27654 SDValue Op2 = Op.getOperand(3);
27655 MVT VT = Op2.getSimpleValueType();
27656 unsigned Opc = 0;
27657 switch (IntNo) {
27658 default:
27659 llvm_unreachable("Unknown Intrinsic");
27660 case Intrinsic::x86_aadd32:
27661 case Intrinsic::x86_aadd64:
27662 Opc = X86ISD::AADD;
27663 break;
27664 case Intrinsic::x86_aand32:
27665 case Intrinsic::x86_aand64:
27666 Opc = X86ISD::AAND;
27667 break;
27668 case Intrinsic::x86_aor32:
27669 case Intrinsic::x86_aor64:
27670 Opc = X86ISD::AOR;
27671 break;
27672 case Intrinsic::x86_axor32:
27673 case Intrinsic::x86_axor64:
27674 Opc = X86ISD::AXOR;
27675 break;
27676 }
27677 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27678 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27679 {Chain, Op1, Op2}, VT, MMO);
27680 }
27681 case Intrinsic::x86_atomic_add_cc:
27682 case Intrinsic::x86_atomic_sub_cc:
27683 case Intrinsic::x86_atomic_or_cc:
27684 case Intrinsic::x86_atomic_and_cc:
27685 case Intrinsic::x86_atomic_xor_cc: {
27686 SDLoc DL(Op);
27687 SDValue Chain = Op.getOperand(0);
27688 SDValue Op1 = Op.getOperand(2);
27689 SDValue Op2 = Op.getOperand(3);
27690 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
27691 MVT VT = Op2.getSimpleValueType();
27692 unsigned Opc = 0;
27693 switch (IntNo) {
27694 default:
27695 llvm_unreachable("Unknown Intrinsic");
27696 case Intrinsic::x86_atomic_add_cc:
27697 Opc = X86ISD::LADD;
27698 break;
27699 case Intrinsic::x86_atomic_sub_cc:
27700 Opc = X86ISD::LSUB;
27701 break;
27702 case Intrinsic::x86_atomic_or_cc:
27703 Opc = X86ISD::LOR;
27704 break;
27705 case Intrinsic::x86_atomic_and_cc:
27706 Opc = X86ISD::LAND;
27707 break;
27708 case Intrinsic::x86_atomic_xor_cc:
27709 Opc = X86ISD::LXOR;
27710 break;
27711 }
27712 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27713 SDValue LockArith =
27714 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27715 {Chain, Op1, Op2}, VT, MMO);
27716 Chain = LockArith.getValue(1);
27717 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
27718 }
27719 }
27720 return SDValue();
27721 }
27722
27723 SDLoc dl(Op);
27724 switch(IntrData->Type) {
27725 default: llvm_unreachable("Unknown Intrinsic Type");
27726 case RDSEED:
27727 case RDRAND: {
27728 // Emit the node with the right value type.
27729 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27730 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27731
27732 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27733    // Otherwise return the value from Rand, which is always 0, cast to i32.
27734 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27735 DAG.getConstant(1, dl, Op->getValueType(1)),
27736 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27737 SDValue(Result.getNode(), 1)};
27738 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27739
27740 // Return { result, isValid, chain }.
27741 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27742 SDValue(Result.getNode(), 2));
27743 }
27744 case GATHER_AVX2: {
27745 SDValue Chain = Op.getOperand(0);
27746 SDValue Src = Op.getOperand(2);
27747 SDValue Base = Op.getOperand(3);
27748 SDValue Index = Op.getOperand(4);
27749 SDValue Mask = Op.getOperand(5);
27750 SDValue Scale = Op.getOperand(6);
27751 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27752 Scale, Chain, Subtarget);
27753 }
27754 case GATHER: {
27755 //gather(v1, mask, index, base, scale);
27756 SDValue Chain = Op.getOperand(0);
27757 SDValue Src = Op.getOperand(2);
27758 SDValue Base = Op.getOperand(3);
27759 SDValue Index = Op.getOperand(4);
27760 SDValue Mask = Op.getOperand(5);
27761 SDValue Scale = Op.getOperand(6);
27762 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27763 Chain, Subtarget);
27764 }
27765 case SCATTER: {
27766 //scatter(base, mask, index, v1, scale);
27767 SDValue Chain = Op.getOperand(0);
27768 SDValue Base = Op.getOperand(2);
27769 SDValue Mask = Op.getOperand(3);
27770 SDValue Index = Op.getOperand(4);
27771 SDValue Src = Op.getOperand(5);
27772 SDValue Scale = Op.getOperand(6);
27773 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27774 Scale, Chain, Subtarget);
27775 }
27776 case PREFETCH: {
27777 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27778 assert((HintVal == 2 || HintVal == 3) &&
27779 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27780 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27781 SDValue Chain = Op.getOperand(0);
27782 SDValue Mask = Op.getOperand(2);
27783 SDValue Index = Op.getOperand(3);
27784 SDValue Base = Op.getOperand(4);
27785 SDValue Scale = Op.getOperand(5);
27786 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27787 Subtarget);
27788 }
27789 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27790 case RDTSC: {
27791     SmallVector<SDValue, 2> Results;
27792     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27793 Results);
27794 return DAG.getMergeValues(Results, dl);
27795 }
27796 // Read Performance Monitoring Counters.
27797 case RDPMC:
27798 // Read Processor Register.
27799 case RDPRU:
27800 // GetExtended Control Register.
27801 case XGETBV: {
27802     SmallVector<SDValue, 2> Results;
27803
27804 // RDPMC uses ECX to select the index of the performance counter to read.
27805 // RDPRU uses ECX to select the processor register to read.
27806 // XGETBV uses ECX to select the index of the XCR register to return.
27807 // The result is stored into registers EDX:EAX.
27808 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27809 Subtarget, Results);
27810 return DAG.getMergeValues(Results, dl);
27811 }
27812 // XTEST intrinsics.
27813 case XTEST: {
27814 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27815 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27816
27817 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27818 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27819 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27820 Ret, SDValue(InTrans.getNode(), 1));
27821 }
27822   case TRUNCATE_TO_MEM_VI8:
27823   case TRUNCATE_TO_MEM_VI16:
27824   case TRUNCATE_TO_MEM_VI32: {
27825 SDValue Mask = Op.getOperand(4);
27826 SDValue DataToTruncate = Op.getOperand(3);
27827 SDValue Addr = Op.getOperand(2);
27828 SDValue Chain = Op.getOperand(0);
27829
27830 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27831 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27832
27833 EVT MemVT = MemIntr->getMemoryVT();
27834
27835 uint16_t TruncationOp = IntrData->Opc0;
27836 switch (TruncationOp) {
27837 case X86ISD::VTRUNC: {
27838 if (isAllOnesConstant(Mask)) // return just a truncate store
27839 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27840 MemIntr->getMemOperand());
27841
27842 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27843 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27844 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27845
27846 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27847 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27848 true /* truncating */);
27849 }
27850 case X86ISD::VTRUNCUS:
27851 case X86ISD::VTRUNCS: {
27852 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27853 if (isAllOnesConstant(Mask))
27854 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27855 MemIntr->getMemOperand(), DAG);
27856
27857 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27858 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27859
27860 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27861 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27862 }
27863 default:
27864 llvm_unreachable("Unsupported truncstore intrinsic");
27865 }
27866 }
27867 case INTR_TYPE_CAST_MMX:
27868 return SDValue(); // handled in combineINTRINSIC_*
27869 }
27870}
27871
27872SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27873 SelectionDAG &DAG) const {
27874   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27875   MFI.setReturnAddressIsTaken(true);
27876
27877   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27878     return SDValue();
27879
27880 unsigned Depth = Op.getConstantOperandVal(0);
27881 SDLoc dl(Op);
27882 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27883
27884 if (Depth > 0) {
27885 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27886 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27887 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27888 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27889 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27890                        MachinePointerInfo());
27891   }
27892
27893 // Just load the return address.
27894 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27895 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27896                      MachinePointerInfo());
27897}
27898
27899SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27900 SelectionDAG &DAG) const {
27901   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27902   return getReturnAddressFrameIndex(DAG);
27903}
27904
27905SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27906   MachineFunction &MF = DAG.getMachineFunction();
27907   MachineFrameInfo &MFI = MF.getFrameInfo();
27908   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27909 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27910 EVT VT = Op.getValueType();
27911
27912 MFI.setFrameAddressIsTaken(true);
27913
27914 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27915 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27916 // is not possible to crawl up the stack without looking at the unwind codes
27917 // simultaneously.
27918 int FrameAddrIndex = FuncInfo->getFAIndex();
27919 if (!FrameAddrIndex) {
27920 // Set up a frame object for the return address.
27921 unsigned SlotSize = RegInfo->getSlotSize();
27922 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27923 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27924 FuncInfo->setFAIndex(FrameAddrIndex);
27925 }
27926 return DAG.getFrameIndex(FrameAddrIndex, VT);
27927 }
27928
27929 unsigned FrameReg =
27930 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27931 SDLoc dl(Op); // FIXME probably not meaningful
27932 unsigned Depth = Op.getConstantOperandVal(0);
27933 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27934 (FrameReg == X86::EBP && VT == MVT::i32)) &&
27935 "Invalid Frame Register!");
27936 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27937 while (Depth--)
27938 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27939                             MachinePointerInfo());
27940   return FrameAddr;
27941}
27942
27943// FIXME? Maybe this could be a TableGen attribute on some registers and
27944// this table could be generated automatically from RegInfo.
27945Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27946                                              const MachineFunction &MF) const {
27947 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27948
27949   Register Reg = StringSwitch<unsigned>(RegName)
27950                      .Case("esp", X86::ESP)
27951 .Case("rsp", X86::RSP)
27952 .Case("ebp", X86::EBP)
27953 .Case("rbp", X86::RBP)
27954 .Case("r14", X86::R14)
27955 .Case("r15", X86::R15)
27956 .Default(0);
27957
27958 if (Reg == X86::EBP || Reg == X86::RBP) {
27959 if (!TFI.hasFP(MF))
27960 report_fatal_error("register " + StringRef(RegName) +
27961 " is allocatable: function has no frame pointer");
27962#ifndef NDEBUG
27963 else {
27964 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27965 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27966 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27967 "Invalid Frame Register!");
27968 }
27969#endif
27970 }
27971
27972 if (Reg)
27973 return Reg;
27974
27975 report_fatal_error("Invalid register name global variable");
27976}
27977
27978SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27979 SelectionDAG &DAG) const {
27980 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27981 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27982}
27983
27984Register X86TargetLowering::getExceptionPointerRegister(
27985    const Constant *PersonalityFn) const {
27986 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27987 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27988
27989 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27990}
27991
27992Register X86TargetLowering::getExceptionSelectorRegister(
27993    const Constant *PersonalityFn) const {
27994  // Funclet personalities don't use selectors (the runtime does the selection).
27995  if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27996    return X86::NoRegister;
27997 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27998}
27999
28000bool X86TargetLowering::needsFixedCatchObjects() const {
28001  return Subtarget.isTargetWin64();
28002}
28003
28004SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28005 SDValue Chain = Op.getOperand(0);
28006 SDValue Offset = Op.getOperand(1);
28007 SDValue Handler = Op.getOperand(2);
28008 SDLoc dl (Op);
28009
28010 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28011 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28012 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28013 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28014 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28015 "Invalid Frame Register!");
28016 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28017 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28018
28019 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28020 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28021 dl));
28022 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28023 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28024 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28025
28026 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28027 DAG.getRegister(StoreAddrReg, PtrVT));
28028}
28029
28030SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28031 SelectionDAG &DAG) const {
28032 SDLoc DL(Op);
28033 // If the subtarget is not 64bit, we may need the global base reg
28034 // after isel expand pseudo, i.e., after CGBR pass ran.
28035 // Therefore, ask for the GlobalBaseReg now, so that the pass
28036 // inserts the code for us in case we need it.
28037 // Otherwise, we will end up in a situation where we will
28038 // reference a virtual register that is not defined!
28039 if (!Subtarget.is64Bit()) {
28040 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28041 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28042 }
28043 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28044 DAG.getVTList(MVT::i32, MVT::Other),
28045 Op.getOperand(0), Op.getOperand(1));
28046}
28047
28048SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28049 SelectionDAG &DAG) const {
28050 SDLoc DL(Op);
28051 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28052 Op.getOperand(0), Op.getOperand(1));
28053}
28054
28055SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28056 SelectionDAG &DAG) const {
28057 SDLoc DL(Op);
28058 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28059 Op.getOperand(0));
28060}
28061
28062static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28063  return Op.getOperand(0);
28064}
28065
28066SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28067 SelectionDAG &DAG) const {
28068 SDValue Root = Op.getOperand(0);
28069 SDValue Trmp = Op.getOperand(1); // trampoline
28070 SDValue FPtr = Op.getOperand(2); // nested function
28071 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28072 SDLoc dl (Op);
28073
28074 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28075 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28076
28077 if (Subtarget.is64Bit()) {
28078 SDValue OutChains[6];
28079
28080 // Large code-model.
28081 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28082 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28083
28084 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28085 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28086
28087 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28088
28089 // Load the pointer to the nested function into R11.
28090 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28091 SDValue Addr = Trmp;
28092 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28093 Addr, MachinePointerInfo(TrmpAddr));
28094
28095 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28096 DAG.getConstant(2, dl, MVT::i64));
28097 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28098 MachinePointerInfo(TrmpAddr, 2), Align(2));
28099
28100 // Load the 'nest' parameter value into R10.
28101 // R10 is specified in X86CallingConv.td
28102 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28103 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28104 DAG.getConstant(10, dl, MVT::i64));
28105 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28106 Addr, MachinePointerInfo(TrmpAddr, 10));
28107
28108 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28109 DAG.getConstant(12, dl, MVT::i64));
28110 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28111 MachinePointerInfo(TrmpAddr, 12), Align(2));
28112
28113 // Jump to the nested function.
28114 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28115 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28116 DAG.getConstant(20, dl, MVT::i64));
28117 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28118 Addr, MachinePointerInfo(TrmpAddr, 20));
28119
28120 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28121 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28122 DAG.getConstant(22, dl, MVT::i64));
28123 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28124 Addr, MachinePointerInfo(TrmpAddr, 22));
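    // Taken together (i16 constants are stored little-endian), the 23
    // trampoline bytes written above are:
    //   +0:  49 BB <FPtr (8 bytes)>   movabsq $FPtr, %r11
    //   +10: 49 BA <Nest (8 bytes)>   movabsq $Nest, %r10
    //   +20: 49 FF E3                 jmpq   *%r11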
28125
28126 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28127 } else {
28128 const Function *Func =
28129 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28130 CallingConv::ID CC = Func->getCallingConv();
28131 unsigned NestReg;
28132
28133 switch (CC) {
28134 default:
28135 llvm_unreachable("Unsupported calling convention");
28136 case CallingConv::C:
28137    case CallingConv::X86_StdCall: {
28138      // Pass 'nest' parameter in ECX.
28139 // Must be kept in sync with X86CallingConv.td
28140 NestReg = X86::ECX;
28141
28142 // Check that ECX wasn't needed by an 'inreg' parameter.
28143 FunctionType *FTy = Func->getFunctionType();
28144 const AttributeList &Attrs = Func->getAttributes();
28145
28146 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28147 unsigned InRegCount = 0;
28148 unsigned Idx = 0;
28149
28150 for (FunctionType::param_iterator I = FTy->param_begin(),
28151 E = FTy->param_end(); I != E; ++I, ++Idx)
28152 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28153 const DataLayout &DL = DAG.getDataLayout();
28154 // FIXME: should only count parameters that are lowered to integers.
28155 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28156 }
28157
28158 if (InRegCount > 2) {
28159 report_fatal_error("Nest register in use - reduce number of inreg"
28160 " parameters!");
28161 }
28162 }
28163 break;
28164 }
28165    case CallingConv::X86_FastCall:
28166    case CallingConv::X86_ThisCall:
28167    case CallingConv::Fast:
28168 case CallingConv::Tail:
28169    case CallingConv::SwiftTail:
28170      // Pass 'nest' parameter in EAX.
28171 // Must be kept in sync with X86CallingConv.td
28172 NestReg = X86::EAX;
28173 break;
28174 }
28175
28176 SDValue OutChains[4];
28177 SDValue Addr, Disp;
28178
28179 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28180 DAG.getConstant(10, dl, MVT::i32));
28181 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28182
28183 // This is storing the opcode for MOV32ri.
28184 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28185 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28186 OutChains[0] =
28187 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28188 Trmp, MachinePointerInfo(TrmpAddr));
28189
28190 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28191 DAG.getConstant(1, dl, MVT::i32));
28192 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28193 MachinePointerInfo(TrmpAddr, 1), Align(1));
28194
28195 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28196 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28197 DAG.getConstant(5, dl, MVT::i32));
28198 OutChains[2] =
28199 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28200 MachinePointerInfo(TrmpAddr, 5), Align(1));
28201
28202 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28203 DAG.getConstant(6, dl, MVT::i32));
28204 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28205 MachinePointerInfo(TrmpAddr, 6), Align(1));
28206
28207 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28208 }
28209}
28210
28211SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28212 SelectionDAG &DAG) const {
28213 /*
28214 The rounding mode is in bits 11:10 of FPSR, and has the following
28215 settings:
28216 00 Round to nearest
28217 01 Round to -inf
28218 10 Round to +inf
28219 11 Round to 0
28220
28221 GET_ROUNDING, on the other hand, expects the following:
28222 -1 Undefined
28223 0 Round to 0
28224 1 Round to nearest
28225 2 Round to +inf
28226 3 Round to -inf
28227
28228 To perform the conversion, we use a packed lookup table of the four 2-bit
28229  values that we can index by FPSR[11:10]
28230 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28231
28232 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28233 */
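  // For example, with FPSR[11:10] = 01 (round toward -inf):
  //   FPSR & 0xc00 = 0x400, 0x400 >> 9 = 2, 0x2d >> 2 = 0xb, 0xb & 3 = 3,
  // which is GET_ROUNDING's encoding of "round to -inf". The other three
  // rounding-control values map to 1, 2 and 0 in the same way.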
28234
28235  MachineFunction &MF = DAG.getMachineFunction();
28236  MVT VT = Op.getSimpleValueType();
28237 SDLoc DL(Op);
28238
28239 // Save FP Control Word to stack slot
28240 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28241 SDValue StackSlot =
28242 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28243
28244  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28245
28246 SDValue Chain = Op.getOperand(0);
28247 SDValue Ops[] = {Chain, StackSlot};
28248  Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28249                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28250                                  Align(2), MachineMemOperand::MOStore);
28251
28252 // Load FP Control Word from stack slot
28253 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28254 Chain = CWD.getValue(1);
28255
28256 // Mask and turn the control bits into a shift for the lookup table.
28257 SDValue Shift =
28258 DAG.getNode(ISD::SRL, DL, MVT::i16,
28259 DAG.getNode(ISD::AND, DL, MVT::i16,
28260 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28261 DAG.getConstant(9, DL, MVT::i8));
28262 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28263
28264 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28265 SDValue RetVal =
28266 DAG.getNode(ISD::AND, DL, MVT::i32,
28267 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28268 DAG.getConstant(3, DL, MVT::i32));
28269
28270 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28271
28272 return DAG.getMergeValues({RetVal, Chain}, DL);
28273}
28274
28275SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28276 SelectionDAG &DAG) const {
28277  MachineFunction &MF = DAG.getMachineFunction();
28278  SDLoc DL(Op);
28279 SDValue Chain = Op.getNode()->getOperand(0);
28280
28281 // FP control word may be set only from data in memory. So we need to allocate
28282 // stack space to save/load FP control word.
28283 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28284 SDValue StackSlot =
28285 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28286  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28287  MachineMemOperand *MMO =
28288      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28289
28290 // Store FP control word into memory.
28291 SDValue Ops[] = {Chain, StackSlot};
28292 Chain = DAG.getMemIntrinsicNode(
28293 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28294
28295 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28296 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28297 Chain = CWD.getValue(1);
28298 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28299 DAG.getConstant(0xf3ff, DL, MVT::i16));
28300
28301 // Calculate new rounding mode.
28302 SDValue NewRM = Op.getNode()->getOperand(1);
28303 SDValue RMBits;
28304 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28305 uint64_t RM = CVal->getZExtValue();
28306 int FieldVal;
28307 switch (static_cast<RoundingMode>(RM)) {
28308 // clang-format off
28309 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28310 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28311 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28312 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28313 default:
28314 llvm_unreachable("rounding mode is not supported by X86 hardware");
28315 // clang-format on
28316 }
28317 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28318 } else {
28319 // Need to convert argument into bits of control word:
28320 // 0 Round to 0 -> 11
28321 // 1 Round to nearest -> 00
28322 // 2 Round to +inf -> 10
28323 // 3 Round to -inf -> 01
28324 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
28325 // To make the conversion, put all these values into a value 0xc9 and shift
28326 // it left depending on the rounding mode:
28327 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28328 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28329 // ...
28330 // (0xc9 << (2 * NewRM + 4)) & 0xc00
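    //
    // For example, NewRM = 2 (round to +inf) gives 0xc9 << (2*2 + 4) = 0xc900
    // and 0xc900 & 0xc00 = 0x800, i.e. bits 11:10 = 10, the x87 encoding of
    // round to +inf; NewRM = 0 gives 0xc90 & 0xc00 = 0xc00 (11, round to 0).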
28331 SDValue ShiftValue =
28332 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28333 DAG.getNode(ISD::ADD, DL, MVT::i32,
28334 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28335 DAG.getConstant(1, DL, MVT::i8)),
28336 DAG.getConstant(4, DL, MVT::i32)));
28337 SDValue Shifted =
28338 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28339 ShiftValue);
28340 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28341 DAG.getConstant(0xc00, DL, MVT::i16));
28342 }
28343
28344 // Update rounding mode bits and store the new FP Control Word into stack.
28345 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28346 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28347
28348 // Load FP control word from the slot.
28349 SDValue OpsLD[] = {Chain, StackSlot};
28350 MachineMemOperand *MMOL =
28351      MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28352  Chain = DAG.getMemIntrinsicNode(
28353 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28354
28355 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28356 // same way but in bits 14:13.
28357 if (Subtarget.hasSSE1()) {
28358 // Store MXCSR into memory.
28359 Chain = DAG.getNode(
28360 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28361 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28362 StackSlot);
28363
28364 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28365 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28366 Chain = CWD.getValue(1);
28367 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28368 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28369
28370 // Shift X87 RM bits from 11:10 to 14:13.
28371 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28372 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28373 DAG.getConstant(3, DL, MVT::i8));
28374
28375 // Update rounding mode bits and store the new FP Control Word into stack.
28376 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28377 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28378
28379 // Load MXCSR from the slot.
28380 Chain = DAG.getNode(
28381 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28382 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28383 StackSlot);
28384 }
28385
28386 return Chain;
28387}
28388
28389const unsigned X87StateSize = 28;
28390const unsigned FPStateSize = 32;
28391[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28392
28393SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28394 SelectionDAG &DAG) const {
28395  MachineFunction &MF = DAG.getMachineFunction();
28396  SDLoc DL(Op);
28397 SDValue Chain = Op->getOperand(0);
28398 SDValue Ptr = Op->getOperand(1);
28399 auto *Node = cast<FPStateAccessSDNode>(Op);
28400 EVT MemVT = Node->getMemoryVT();
28402 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28403
28404  // Get x87 state, if it is present.
28405 if (Subtarget.hasX87()) {
28406 Chain =
28407 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28408 {Chain, Ptr}, MemVT, MMO);
28409
28410 // FNSTENV changes the exception mask, so load back the stored environment.
28411 MachineMemOperand::Flags NewFlags =
28412        MachineMemOperand::MOLoad |
28413        (MMO->getFlags() & ~MachineMemOperand::MOStore);
28414 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28415 Chain =
28416 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28417 {Chain, Ptr}, MemVT, MMO);
28418 }
28419
28420 // If target supports SSE, get MXCSR as well.
28421 if (Subtarget.hasSSE1()) {
28422 // Get pointer to the MXCSR location in memory.
28424 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28425 DAG.getConstant(X87StateSize, DL, PtrVT));
28426 // Store MXCSR into memory.
28427 Chain = DAG.getNode(
28428 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28429 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28430 MXCSRAddr);
28431 }
28432
28433 return Chain;
28434}
28435
28436static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28437                                   EVT MemVT, MachineMemOperand *MMO,
28438 SelectionDAG &DAG,
28439 const X86Subtarget &Subtarget) {
28440  // Set x87 state, if it is present.
28441 if (Subtarget.hasX87())
28442 Chain =
28443 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28444 {Chain, Ptr}, MemVT, MMO);
28445 // If target supports SSE, set MXCSR as well.
28446 if (Subtarget.hasSSE1()) {
28447 // Get pointer to the MXCSR location in memory.
28449 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28450 DAG.getConstant(X87StateSize, DL, PtrVT));
28451 // Load MXCSR from memory.
28452 Chain = DAG.getNode(
28453 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28454 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28455 MXCSRAddr);
28456 }
28457 return Chain;
28458}
28459
28460SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28461 SelectionDAG &DAG) const {
28462 SDLoc DL(Op);
28463 SDValue Chain = Op->getOperand(0);
28464 SDValue Ptr = Op->getOperand(1);
28465 auto *Node = cast<FPStateAccessSDNode>(Op);
28466 EVT MemVT = Node->getMemoryVT();
28468 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28469 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28470}
28471
28472SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28473 SelectionDAG &DAG) const {
28474  MachineFunction &MF = DAG.getMachineFunction();
28475  SDLoc DL(Op);
28476 SDValue Chain = Op.getNode()->getOperand(0);
28477
28478 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28479 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28480  SmallVector<Constant *, 8> FPEnvVals;
28481
28482 // x87 FPU Control Word: mask all floating-point exceptions, sets rounding to
28483 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28484 // for compatibility with glibc.
28485 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28486 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28487 Constant *Zero = ConstantInt::get(ItemTy, 0);
28488 for (unsigned I = 0; I < 6; ++I)
28489 FPEnvVals.push_back(Zero);
28490
28491 // MXCSR: mask all floating-point exceptions, sets rounding to nearest, clear
28492 // all exceptions, sets DAZ and FTZ to 0.
28493 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28494 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28496 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28497 MachinePointerInfo MPI =
28501
28502 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28503}
28504
28505// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28506uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28507 assert((Amt < 8) && "Shift/Rotation amount out of range");
28508 switch (Opcode) {
28509 case ISD::BITREVERSE:
28510 return 0x8040201008040201ULL;
28511 case ISD::SHL:
28512 return ((0x0102040810204080ULL >> (Amt)) &
28513 (0x0101010101010101ULL * (0xFF >> (Amt))));
28514 case ISD::SRL:
28515 return ((0x0102040810204080ULL << (Amt)) &
28516 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28517 case ISD::SRA:
28518 return (getGFNICtrlImm(ISD::SRL, Amt) |
28519 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28520 case ISD::ROTL:
28521 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28522 case ISD::ROTR:
28523 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28524 }
28525 llvm_unreachable("Unsupported GFNI opcode");
28526}
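// As a worked example of the arithmetic above, getGFNICtrlImm(ISD::SHL, 1) is
// (0x0102040810204080 >> 1) & (0x0101010101010101 * 0x7F)
//   = 0x0081020408102040 & 0x7F7F7F7F7F7F7F7F = 0x0001020408102040,
// which getGFNICtrlMask below splats byte-by-byte to form the vXi8
// gf2p8affine control mask for a left shift by one.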
28527
28528// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28529SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28530 MVT VT, unsigned Amt = 0) {
28531 assert(VT.getVectorElementType() == MVT::i8 &&
28532 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28533 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28534 SmallVector<SDValue> MaskBits;
28535 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28536 uint64_t Bits = (Imm >> (I % 64)) & 255;
28537 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28538 }
28539 return DAG.getBuildVector(VT, DL, MaskBits);
28540}
28541
28542/// Lower a vector CTLZ using native supported vector CTLZ instruction.
28543//
28544// i8/i16 vector implemented using dword LZCNT vector instruction
28545// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28546// split the vector, perform the operation on its Lo and Hi parts and
28547// concatenate the results.
28548static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28549                                         const X86Subtarget &Subtarget) {
28550 assert(Op.getOpcode() == ISD::CTLZ);
28551 SDLoc dl(Op);
28552 MVT VT = Op.getSimpleValueType();
28553 MVT EltVT = VT.getVectorElementType();
28554 unsigned NumElems = VT.getVectorNumElements();
28555
28556 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28557 "Unsupported element type");
28558
28559   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28560 if (NumElems > 16 ||
28561 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28562 return splitVectorIntUnary(Op, DAG, dl);
28563
28564 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28565 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28566 "Unsupported value type for operation");
28567
28568 // Use native supported vector instruction vplzcntd.
28569 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28570 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28571 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28572 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28573
28574 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28575}
28576
28577// Lower CTLZ using a PSHUFB lookup table implementation.
28578static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28579                                       const X86Subtarget &Subtarget,
28580 SelectionDAG &DAG) {
28581 MVT VT = Op.getSimpleValueType();
28582 int NumElts = VT.getVectorNumElements();
28583 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28584 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28585
28586 // Per-nibble leading zero PSHUFB lookup table.
28587 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28588 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28589 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28590 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
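  // For example, for the byte 0x1A the hi nibble is 0x1, so LUT[0x1] = 3; the
  // hi nibble is non-zero, so the lo-nibble result is masked to zero below and
  // the final count is ctlz(0x1A) = 3 for that byte.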
28591
28592  SmallVector<SDValue, 64> LUTVec;
28593  for (int i = 0; i < NumBytes; ++i)
28594 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28595 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28596
28597 // Begin by bitcasting the input to byte vector, then split those bytes
28598 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28599 // If the hi input nibble is zero then we add both results together, otherwise
28600 // we just take the hi result (by masking the lo result to zero before the
28601 // add).
28602 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28603 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28604
28605 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28606 SDValue Lo = Op0;
28607 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28608 SDValue HiZ;
28609 if (CurrVT.is512BitVector()) {
28610 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28611 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28612 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28613 } else {
28614 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28615 }
28616
28617 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
28618 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
28619 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
28620 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
28621
28622 // Merge result back from vXi8 back to VT, working on the lo/hi halves
28623 // of the current vector width in the same way we did for the nibbles.
28624 // If the upper half of the input element is zero then add the halves'
28625 // leading zero counts together, otherwise just use the upper half's.
28626 // Double the width of the result until we are at target width.
28627 while (CurrVT != VT) {
28628 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
28629 int CurrNumElts = CurrVT.getVectorNumElements();
28630 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
28631 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
28632 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
28633
28634 // Check if the upper half of the input element is zero.
28635 if (CurrVT.is512BitVector()) {
28636 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28637 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
28638 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28639 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28640 } else {
28641 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
28642 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
28643 }
28644 HiZ = DAG.getBitcast(NextVT, HiZ);
28645
28646 // Move the upper/lower halves to the lower bits as we'll be extending to
28647 // NextVT. Mask the lower result to zero if HiZ is true and add the results
28648 // together.
28649 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
28650 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
28651 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
28652 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
28653 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
28654 CurrVT = NextVT;
28655 }
28656
28657 return Res;
28658}
28659
28660static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
28661                               const X86Subtarget &Subtarget,
28662 SelectionDAG &DAG) {
28663 MVT VT = Op.getSimpleValueType();
28664
28665 if (Subtarget.hasCDI() &&
28666 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28667 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28668 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28669
28670 // Decompose 256-bit ops into smaller 128-bit ops.
28671 if (VT.is256BitVector() && !Subtarget.hasInt256())
28672 return splitVectorIntUnary(Op, DAG, DL);
28673
28674 // Decompose 512-bit ops into smaller 256-bit ops.
28675 if (VT.is512BitVector() && !Subtarget.hasBWI())
28676 return splitVectorIntUnary(Op, DAG, DL);
28677
28678 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28679 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28680}
28681
28682static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28683 SelectionDAG &DAG) {
28684 MVT VT = Op.getSimpleValueType();
28685 MVT OpVT = VT;
28686 unsigned NumBits = VT.getSizeInBits();
28687 SDLoc dl(Op);
28688 unsigned Opc = Op.getOpcode();
28689
28690 if (VT.isVector())
28691 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28692
28693 Op = Op.getOperand(0);
28694 if (VT == MVT::i8) {
28695 // Zero extend to i32 since there is not an i8 bsr.
28696 OpVT = MVT::i32;
28697 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28698 }
28699
28700 // Check if we can safely pass a result though BSR for zero sources.
28701 SDValue PassThru = DAG.getUNDEF(OpVT);
28702 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
28703 !DAG.isKnownNeverZero(Op))
28704 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
28705
28706 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28707 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28708 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
28709
28710 // Skip CMOV if we're using a pass through value.
28711 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
28712 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28713 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28714 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28715 Op.getValue(1)};
28716 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28717 }
28718
28719 // Finally xor with NumBits-1.
28720 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28721 DAG.getConstant(NumBits - 1, dl, OpVT));
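  // For example, for an i32 input of 0x00010000, BSR yields bit index 16 and
  // 16 ^ 31 == 15 == ctlz(0x00010000); for indices in [0, 31] the xor with 31
  // is the same as computing 31 - index.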
28722
28723 if (VT == MVT::i8)
28724 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28725 return Op;
28726}
28727
28728static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28729 SelectionDAG &DAG) {
28730 MVT VT = Op.getSimpleValueType();
28731 unsigned NumBits = VT.getScalarSizeInBits();
28732 SDValue N0 = Op.getOperand(0);
28733 SDLoc dl(Op);
28734 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
28735
28736 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28737 "Only scalar CTTZ requires custom lowering");
28738
28739 // Check if we can safely pass a result though BSF for zero sources.
28740 SDValue PassThru = DAG.getUNDEF(VT);
28741 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
28742 PassThru = DAG.getConstant(NumBits, dl, VT);
28743
28744 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28745 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28746 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
28747
28748 // Skip CMOV if src is never zero or we're using a pass through value.
28749 if (NonZeroSrc || !PassThru.isUndef())
28750 return Op;
28751
28752 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28753 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28754 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28755 Op.getValue(1)};
28756 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28757}
28758
28759static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28760                           const X86Subtarget &Subtarget) {
28761 MVT VT = Op.getSimpleValueType();
28762 SDLoc DL(Op);
28763
28764 if (VT == MVT::i16 || VT == MVT::i32)
28765 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
28766
28767 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28768 return splitVectorIntBinary(Op, DAG, DL);
28769
28770 assert(Op.getSimpleValueType().is256BitVector() &&
28771 Op.getSimpleValueType().isInteger() &&
28772 "Only handle AVX 256-bit vector integer operation");
28773 return splitVectorIntBinary(Op, DAG, DL);
28774}
28775
28776static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28777                                  const X86Subtarget &Subtarget) {
28778 MVT VT = Op.getSimpleValueType();
28779 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28780 unsigned Opcode = Op.getOpcode();
28781 SDLoc DL(Op);
28782
28783 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28784 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28785 assert(Op.getSimpleValueType().isInteger() &&
28786 "Only handle AVX vector integer operation");
28787 return splitVectorIntBinary(Op, DAG, DL);
28788 }
28789
28790 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28792 EVT SetCCResultType =
28793 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28794
28795 unsigned BitWidth = VT.getScalarSizeInBits();
28796 if (Opcode == ISD::USUBSAT) {
28797 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28798 // Handle a special-case with a bit-hack instead of cmp+select:
28799 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28800 // If the target can use VPTERNLOG, DAGToDAG will match this as
28801 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28802 // "broadcast" constant load.
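      // For example, with i16 elements (SMIN = 0x8000): X = 0x8005 gives
      // (0x8005 ^ 0x8000) & (0x8005 s>> 15) = 0x0005 & 0xFFFF = 0x0005, while
      // X = 0x7FFF gives 0xFFFF & 0x0000 = 0, matching usubsat X, 0x8000.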
28803      ConstantSDNode *C = isConstOrConstSplat(Y, true);
28804      if (C && C->getAPIntValue().isSignMask()) {
28805 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28806 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28807 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28808 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28809 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28810 }
28811 }
28812 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28813 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28814 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28815 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28816 // TODO: Move this to DAGCombiner?
28817 if (SetCCResultType == VT &&
28818 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28819 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28820 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28821 }
28822 }
28823
28824 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28825 (!VT.isVector() || VT == MVT::v2i64)) {
28826    APInt MinVal = APInt::getSignedMinValue(BitWidth);
28827    APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28828    SDValue Zero = DAG.getConstant(0, DL, VT);
28829 SDValue Result =
28830 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28831 DAG.getVTList(VT, SetCCResultType), X, Y);
28832 SDValue SumDiff = Result.getValue(0);
28833 SDValue Overflow = Result.getValue(1);
28834 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28835 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28836 SDValue SumNeg =
28837 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
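    // For example, i64 saddsat(INT64_MAX, 1): SADDO reports overflow and the
    // wrapped SumDiff is negative, so the result must have overflowed in the
    // positive direction and SatMax is selected below.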
28838 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28839 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28840 }
28841
28842 // Use default expansion.
28843 return SDValue();
28844}
28845
28846static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28847 SelectionDAG &DAG) {
28848 MVT VT = Op.getSimpleValueType();
28849 SDLoc DL(Op);
28850
28851 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28852 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28853 // 8-bit integer abs to NEG and CMOV.
28854 SDValue N0 = Op.getOperand(0);
28855 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28856 DAG.getConstant(0, DL, VT), N0);
28857 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28858 SDValue(Neg.getNode(), 1)};
28859 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28860 }
28861
28862 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28863 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28864 SDValue Src = Op.getOperand(0);
28865 SDValue Neg = DAG.getNegative(Src, DL, VT);
28866 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
28867 }
28868
28869 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28870 assert(VT.isInteger() &&
28871 "Only handle AVX 256-bit vector integer operation");
28872 return splitVectorIntUnary(Op, DAG, DL);
28873 }
28874
28875 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28876 return splitVectorIntUnary(Op, DAG, DL);
28877
28878 // Default to expand.
28879 return SDValue();
28880}
28881
28882static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28883 SelectionDAG &DAG) {
28884 MVT VT = Op.getSimpleValueType();
28885 SDLoc DL(Op);
28886
28887 // For AVX1 cases, split to use legal ops.
28888 if (VT.is256BitVector() && !Subtarget.hasInt256())
28889 return splitVectorIntBinary(Op, DAG, DL);
28890
28891 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28892 return splitVectorIntBinary(Op, DAG, DL);
28893
28894 // Default to expand.
28895 return SDValue();
28896}
28897
28898static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
28899 SelectionDAG &DAG) {
28900 MVT VT = Op.getSimpleValueType();
28901 SDLoc DL(Op);
28902
28903 // For AVX1 cases, split to use legal ops.
28904 if (VT.is256BitVector() && !Subtarget.hasInt256())
28905 return splitVectorIntBinary(Op, DAG, DL);
28906
28907 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28908 return splitVectorIntBinary(Op, DAG, DL);
28909
28910 // Default to expand.
28911 return SDValue();
28912}
28913
28914static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
28915                                      SelectionDAG &DAG) {
28916 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28917 EVT VT = Op.getValueType();
28918 SDValue X = Op.getOperand(0);
28919 SDValue Y = Op.getOperand(1);
28920 SDLoc DL(Op);
28921 bool IsMaxOp =
28922 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
28923 bool IsNum =
28924 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
28925 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
28926 unsigned Opc = 0;
28927 if (VT.isVector())
28928 Opc = X86ISD::VMINMAX;
28929 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
28930 Opc = X86ISD::VMINMAXS;
28931
28932 if (Opc) {
28933 SDValue Imm =
28934 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
28935 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
28936 }
28937 }
28938
28939 uint64_t SizeInBits = VT.getScalarSizeInBits();
28940 APInt PreferredZero = APInt::getZero(SizeInBits);
28941 APInt OppositeZero = PreferredZero;
28942 EVT IVT = VT.changeTypeToInteger();
28943 X86ISD::NodeType MinMaxOp;
28944 if (IsMaxOp) {
28945 MinMaxOp = X86ISD::FMAX;
28946 OppositeZero.setSignBit();
28947 } else {
28948 PreferredZero.setSignBit();
28949 MinMaxOp = X86ISD::FMIN;
28950 }
28951 EVT SetCCType =
28952 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28953
28954 // The tables below show the expected result of Max in cases of NaN and
28955 // signed zeros.
28956 //
28957 // Y Y
28958 // Num xNaN +0 -0
28959 // --------------- ---------------
28960 // Num | Max | Y | +0 | +0 | +0 |
28961 // X --------------- X ---------------
28962 // xNaN | X | X/Y | -0 | +0 | -0 |
28963 // --------------- ---------------
28964 //
28965 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
28966 // reordering.
28967 //
28968 // We check if any of operands is NaN and return NaN. Then we check if any of
28969 // operands is zero or negative zero (for fmaximum and fminimum respectively)
28970 // to ensure the correct zero is returned.
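  // For example, fmaximum(+0.0, -0.0) must return +0.0 even though the two
  // operands compare equal, and fmaximum(NaN, 1.0) must return NaN, which is
  // why both the operand reordering and the final NaN select are needed.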
28971 auto MatchesZero = [](SDValue Op, APInt Zero) {
28972    Op = peekThroughBitcasts(Op);
28973    if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
28974 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28975 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
28976 return CstOp->getAPIntValue() == Zero;
28977 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28978 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28979 for (const SDValue &OpVal : Op->op_values()) {
28980 if (OpVal.isUndef())
28981 continue;
28982 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
28983 if (!CstOp)
28984 return false;
28985 if (!CstOp->getValueAPF().isZero())
28986 continue;
28987 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28988 return false;
28989 }
28990 return true;
28991 }
28992 return false;
28993 };
28994
28995 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
28996 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
28997 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
28998 Op->getFlags().hasNoSignedZeros() ||
28999 DAG.isKnownNeverZeroFloat(X) ||
29000                          DAG.isKnownNeverZeroFloat(Y);
29001  SDValue NewX, NewY;
29002 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29003 MatchesZero(X, OppositeZero)) {
29004 // Operands are already in right order or order does not matter.
29005 NewX = X;
29006 NewY = Y;
29007 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29008 NewX = Y;
29009 NewY = X;
29010 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29011 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29012 if (IsXNeverNaN)
29013 std::swap(X, Y);
29014     // VFPCLASSS consumes a vector type, so provide a minimal one that
29015     // corresponds to an xmm register.
29016 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29017     SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29018 // Bits of classes:
29019 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29020 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29021 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29022 DL, MVT::i32);
29023 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29024 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29025 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29026 DAG.getVectorIdxConstant(0, DL));
29027 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29028 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29029 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29030 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29031 } else {
29032 SDValue IsXSigned;
29033 if (Subtarget.is64Bit() || VT != MVT::f64) {
29034 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29035 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29036 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29037 } else {
29038 assert(VT == MVT::f64);
29039 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29040 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29041 DAG.getVectorIdxConstant(0, DL));
29042 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29043 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29044 DAG.getVectorIdxConstant(1, DL));
29045 Hi = DAG.getBitcast(MVT::i32, Hi);
29046 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29047 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29048 *DAG.getContext(), MVT::i32);
29049 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29050 }
29051 if (MinMaxOp == X86ISD::FMAX) {
29052 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29053 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29054 } else {
29055 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29056 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29057 }
29058 }
29059
29060 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29061 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29062
29063   // If the operands were not reordered for signed-zero handling, NaN still
29064   // needs to be handled, and the second operand is known never to be NaN, then
29065   // put it in the first operand so no NaN fixup is needed after the max/min.
29066 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
29067 std::swap(NewX, NewY);
29068
29069 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29070
29071 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
29072 return MinMax;
29073
29074 SDValue IsNaN =
29075 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29076
29077 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29078}
29079
29080static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29081 SelectionDAG &DAG) {
29082 MVT VT = Op.getSimpleValueType();
29083 SDLoc dl(Op);
29084
29085 // For AVX1 cases, split to use legal ops.
29086 if (VT.is256BitVector() && !Subtarget.hasInt256())
29087 return splitVectorIntBinary(Op, DAG, dl);
29088
29089 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29090 return splitVectorIntBinary(Op, DAG, dl);
29091
29092 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29094
29095 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29096 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29097 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29098
29099 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29100 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
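    // For example, abdu(i32 3, i32 7) == select(3 u< 7, 7 - 3, 3 - 7) == 4.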
29101 if (VT.bitsGE(MVT::i32)) {
29102 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29103 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29104 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29105 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29106 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29107 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29108 DAG.getTargetConstant(CC, dl, MVT::i8),
29109 Diff1.getValue(1));
29110 }
29111
29112 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29113 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29114 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29115 MVT WideVT = MVT::getIntegerVT(WideBits);
29116 if (TLI.isTypeLegal(WideVT)) {
29117 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29118 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29119 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29120 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29121 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29122 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29123 DAG.getTargetConstant(CC, dl, MVT::i8),
29124 Diff1.getValue(1));
29125 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29126 }
29127 }
29128
29129 // Default to expand.
29130 return SDValue();
29131}
29132
29133static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29134 SelectionDAG &DAG) {
29135 SDLoc dl(Op);
29136 MVT VT = Op.getSimpleValueType();
29137
29138 // Decompose 256-bit ops into 128-bit ops.
29139 if (VT.is256BitVector() && !Subtarget.hasInt256())
29140 return splitVectorIntBinary(Op, DAG, dl);
29141
29142 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29143 return splitVectorIntBinary(Op, DAG, dl);
29144
29145 SDValue A = Op.getOperand(0);
29146 SDValue B = Op.getOperand(1);
29147
29148 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29149 // vector pairs, multiply and truncate.
29150 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29151 unsigned NumElts = VT.getVectorNumElements();
29152 unsigned NumLanes = VT.getSizeInBits() / 128;
29153 unsigned NumEltsPerLane = NumElts / NumLanes;
29154
29155 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29156 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29157 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29158 return DAG.getNode(
29159 ISD::TRUNCATE, dl, VT,
29160 DAG.getNode(ISD::MUL, dl, ExVT,
29161 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29162 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29163 }
29164
29165 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29166
29167 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29168 // Don't do this if we only need to unpack one half.
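    // VPMADDUBSW multiplies unsigned bytes of its first operand by signed bytes
    // of its second and adds adjacent byte products into i16 lanes. Masking B so
    // that every other byte is zero leaves exactly one byte product per i16
    // lane, and since only the low 8 bits of each product are kept, the
    // unsigned*signed mismatch does not matter.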
29169 if (Subtarget.hasSSSE3()) {
29170 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29171 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29172 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29173 if (BIsBuildVector) {
29174 for (auto [Idx, Val] : enumerate(B->ops())) {
29175 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29176 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29177 else
29178 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29179 }
29180 }
29181 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29182 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29183 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29184 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29185 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29186 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29187 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29188 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29189 DAG.getTargetConstant(8, dl, MVT::i8));
29190 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29191 }
29192 }
29193
29194     // Extract the lo/hi parts and any-extend them to i16.
29195 // We're going to mask off the low byte of each result element of the
29196 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29197 // element.
29198 SDValue Undef = DAG.getUNDEF(VT);
29199 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29200 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29201
29202 SDValue BLo, BHi;
29203 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29204 // If the RHS is a constant, manually unpackl/unpackh.
29205 SmallVector<SDValue, 16> LoOps, HiOps;
29206 for (unsigned i = 0; i != NumElts; i += 16) {
29207 for (unsigned j = 0; j != 8; ++j) {
29208 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29209 MVT::i16));
29210 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29211 MVT::i16));
29212 }
29213 }
29214
29215 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29216 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29217 } else {
29218 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29219 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29220 }
29221
29222     // Multiply, mask the lower 8 bits of the lo/hi results and pack.
29223 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29224 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29225 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29226 }
29227
29228 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
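  // For example, with A = <a0,a1,a2,a3> and B = <b0,b1,b2,b3>:
  //   Evens = pmuludq(A, B)                          -> <a0*b0, a2*b2>
  //   Odds  = pmuludq(<a1,a1,a3,a3>, <b1,b1,b3,b3>)  -> <a1*b1, a3*b3>
  // and the final <0,4,2,6> shuffle interleaves the low 32 bits of each 64-bit
  // product back into <a0*b0, a1*b1, a2*b2, a3*b3>.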
29229 if (VT == MVT::v4i32) {
29230 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29231 "Should not custom lower when pmulld is available!");
29232
29233 // Extract the odd parts.
29234 static const int UnpackMask[] = {1, 1, 3, 3};
29235 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29236 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29237
29238 // Multiply the even parts.
29239 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29240 DAG.getBitcast(MVT::v2i64, A),
29241 DAG.getBitcast(MVT::v2i64, B));
29242 // Now multiply odd parts.
29243 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29244 DAG.getBitcast(MVT::v2i64, Aodds),
29245 DAG.getBitcast(MVT::v2i64, Bodds));
29246
29247 Evens = DAG.getBitcast(VT, Evens);
29248 Odds = DAG.getBitcast(VT, Odds);
29249
29250 // Merge the two vectors back together with a shuffle. This expands into 2
29251 // shuffles.
29252 static const int ShufMask[] = { 0, 4, 2, 6 };
29253 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29254 }
29255
29256 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29257 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29258 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29259
29260 // Ahi = psrlqi(a, 32);
29261 // Bhi = psrlqi(b, 32);
29262 //
29263 // AloBlo = pmuludq(a, b);
29264 // AloBhi = pmuludq(a, Bhi);
29265 // AhiBlo = pmuludq(Ahi, b);
29266 //
29267 // Hi = psllqi(AloBhi + AhiBlo, 32);
29268 // return AloBlo + Hi;
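  //
  // This follows from writing a = Ahi*2^32 + Alo and b = Bhi*2^32 + Blo:
  //   a*b mod 2^64 == Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)
  // since the Ahi*Bhi term is shifted entirely out of the low 64 bits.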
29269 KnownBits AKnown = DAG.computeKnownBits(A);
29270 KnownBits BKnown = DAG.computeKnownBits(B);
29271
29272 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29273 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29274 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29275
29276 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29277 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29278 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29279
29280 SDValue Zero = DAG.getConstant(0, dl, VT);
29281
29282 // Only multiply lo/hi halves that aren't known to be zero.
29283 SDValue AloBlo = Zero;
29284 if (!ALoIsZero && !BLoIsZero)
29285 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29286
29287 SDValue AloBhi = Zero;
29288 if (!ALoIsZero && !BHiIsZero) {
29289 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29290 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29291 }
29292
29293 SDValue AhiBlo = Zero;
29294 if (!AHiIsZero && !BLoIsZero) {
29295 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29296 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29297 }
29298
29299 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29300 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29301
29302 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29303}
29304
29305 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29306 MVT VT, bool IsSigned,
29307 const X86Subtarget &Subtarget,
29308 SelectionDAG &DAG,
29309 SDValue *Low = nullptr) {
29310 unsigned NumElts = VT.getVectorNumElements();
29311
29312 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29313 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29314 // lane results back together.
29315
29316 // We'll take different approaches for signed and unsigned.
29317   // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29318   // and use pmullw to calculate the full 16-bit product.
29319   // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29320 // shift them left into the upper byte of each word. This allows us to use
29321 // pmulhw to calculate the full 16-bit product. This trick means we don't
29322 // need to sign extend the bytes to use pmullw.
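  //
  // The signed trick works because each i16 lane then holds a*2^8 and b*2^8,
  // so pmulhw computes (a*2^8 * b*2^8) >> 16 == a*b, i.e. the exact 16-bit
  // signed product, without any explicit sign extension.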
29323
29324 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29325 SDValue Zero = DAG.getConstant(0, dl, VT);
29326
29327 SDValue ALo, AHi;
29328 if (IsSigned) {
29329 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29330 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29331 } else {
29332 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29333 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29334 }
29335
29336 SDValue BLo, BHi;
29337 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29338 // If the RHS is a constant, manually unpackl/unpackh and extend.
29339 SmallVector<SDValue, 16> LoOps, HiOps;
29340 for (unsigned i = 0; i != NumElts; i += 16) {
29341 for (unsigned j = 0; j != 8; ++j) {
29342 SDValue LoOp = B.getOperand(i + j);
29343 SDValue HiOp = B.getOperand(i + j + 8);
29344
29345 if (IsSigned) {
29346 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29347 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29348 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29349 DAG.getConstant(8, dl, MVT::i16));
29350 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29351 DAG.getConstant(8, dl, MVT::i16));
29352 } else {
29353 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29354 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29355 }
29356
29357 LoOps.push_back(LoOp);
29358 HiOps.push_back(HiOp);
29359 }
29360 }
29361
29362 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29363 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29364 } else if (IsSigned) {
29365 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29366 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29367 } else {
29368 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29369 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29370 }
29371
29372   // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
29373   // and pack back to vXi8.
29374 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29375 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29376 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29377
29378 if (Low)
29379 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29380
29381 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29382}
29383
29384static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29385 SelectionDAG &DAG) {
29386 SDLoc dl(Op);
29387 MVT VT = Op.getSimpleValueType();
29388 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29389 unsigned NumElts = VT.getVectorNumElements();
29390 SDValue A = Op.getOperand(0);
29391 SDValue B = Op.getOperand(1);
29392
29393 // Decompose 256-bit ops into 128-bit ops.
29394 if (VT.is256BitVector() && !Subtarget.hasInt256())
29395 return splitVectorIntBinary(Op, DAG, dl);
29396
29397 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29398 return splitVectorIntBinary(Op, DAG, dl);
29399
29400 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29401 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29402 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29403 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29404
29405 // PMULxD operations multiply each even value (starting at 0) of LHS with
29406     // the related value of RHS and produce a widened result.
29407 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29408 // => <2 x i64> <ae|cg>
29409 //
29410     // In other words, to have all the results, we need to perform two PMULxD:
29411     // 1. one with the even values.
29412     // 2. one with the odd values.
29413     // To achieve #2, we need to place the odd values at an even position.
29414 //
29415 // Place the odd value at an even position (basically, shift all values 1
29416 // step to the left):
29417 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29418 9, -1, 11, -1, 13, -1, 15, -1};
29419 // <a|b|c|d> => <b|undef|d|undef>
29420 SDValue Odd0 =
29421 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29422 // <e|f|g|h> => <f|undef|h|undef>
29423 SDValue Odd1 =
29424 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29425
29426 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29427 // ints.
29428 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29429 unsigned Opcode =
29430 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29431 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29432 // => <2 x i64> <ae|cg>
29433 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29434 DAG.getBitcast(MulVT, A),
29435 DAG.getBitcast(MulVT, B)));
29436 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29437 // => <2 x i64> <bf|dh>
29438 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29439 DAG.getBitcast(MulVT, Odd0),
29440 DAG.getBitcast(MulVT, Odd1)));
29441
29442 // Shuffle it back into the right order.
29443 SmallVector<int, 16> ShufMask(NumElts);
29444 for (int i = 0; i != (int)NumElts; ++i)
29445 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29446
29447 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29448
29449     // If we have a signed multiply but no PMULDQ, fix up the result of the
29450     // unsigned multiply.
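    //
    // Writing each signed element as a_s = a_u - 2^32*(a < 0) gives
    //   mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
    // (mod 2^32), which is exactly the correction applied here.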
29451 if (IsSigned && !Subtarget.hasSSE41()) {
29452 SDValue Zero = DAG.getConstant(0, dl, VT);
29453 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29454 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29455 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29456 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29457
29458 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29459 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29460 }
29461
29462 return Res;
29463 }
29464
29465 // Only i8 vectors should need custom lowering after this.
29466 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29467 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29468 "Unsupported vector type");
29469
29470 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29471 // logical shift down the upper half and pack back to i8.
29472
29473 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29474 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29475
29476 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29477 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29478 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29479 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29480 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29481 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29482 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29483 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29484 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29485 }
29486
29487 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29488}
29489
29490// Custom lowering for SMULO/UMULO.
29491static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29492 SelectionDAG &DAG) {
29493 MVT VT = Op.getSimpleValueType();
29494
29495 // Scalars defer to LowerXALUO.
29496 if (!VT.isVector())
29497 return LowerXALUO(Op, DAG);
29498
29499 SDLoc dl(Op);
29500 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29501 SDValue A = Op.getOperand(0);
29502 SDValue B = Op.getOperand(1);
29503 EVT OvfVT = Op->getValueType(1);
29504
29505 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29506 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29507 // Extract the LHS Lo/Hi vectors
29508 SDValue LHSLo, LHSHi;
29509 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29510
29511 // Extract the RHS Lo/Hi vectors
29512 SDValue RHSLo, RHSHi;
29513 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29514
29515 EVT LoOvfVT, HiOvfVT;
29516 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29517 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29518 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29519
29520 // Issue the split operations.
29521 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29522 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29523
29524 // Join the separate data results and the overflow results.
29525 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29526 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29527 Hi.getValue(1));
29528
29529 return DAG.getMergeValues({Res, Ovf}, dl);
29530 }
29531
29532 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29533 EVT SetccVT =
29534 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29535
29536 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29537 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29538 unsigned NumElts = VT.getVectorNumElements();
29539 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29540 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29541 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29542 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29543 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29544
29545 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29546
29547 SDValue Ovf;
29548 if (IsSigned) {
29549 SDValue High, LowSign;
29550 if (OvfVT.getVectorElementType() == MVT::i1 &&
29551 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29552         // Rather than truncating, try to do the compare on vXi16 or vXi32.
29553 // Shift the high down filling with sign bits.
29554 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29555 // Fill all 16 bits with the sign bit from the low.
29556 LowSign =
29557 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29558 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29559 15, DAG);
29560 SetccVT = OvfVT;
29561 if (!Subtarget.hasBWI()) {
29562 // We can't do a vXi16 compare so sign extend to v16i32.
29563 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29564 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29565 }
29566 } else {
29567 // Otherwise do the compare at vXi8.
29568 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29569 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29570 LowSign =
29571 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29572 }
29573
29574 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29575 } else {
29576 SDValue High =
29577 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29578 if (OvfVT.getVectorElementType() == MVT::i1 &&
29579 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29580         // Rather than truncating, try to do the compare on vXi16 or vXi32.
29581 SetccVT = OvfVT;
29582 if (!Subtarget.hasBWI()) {
29583 // We can't do a vXi16 compare so sign extend to v16i32.
29584 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
29585 }
29586 } else {
29587 // Otherwise do the compare at vXi8.
29588 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29589 }
29590
29591 Ovf =
29592 DAG.getSetCC(dl, SetccVT, High,
29593 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
29594 }
29595
29596 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29597
29598 return DAG.getMergeValues({Low, Ovf}, dl);
29599 }
29600
29601 SDValue Low;
29602 SDValue High =
29603 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
29604
29605 SDValue Ovf;
29606 if (IsSigned) {
29607 // SMULO overflows if the high bits don't match the sign of the low.
29608 SDValue LowSign =
29609 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29610 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29611 } else {
29612 // UMULO overflows if the high bits are non-zero.
29613 Ovf =
29614 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
29615 }
29616
29617 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
29618
29619 return DAG.getMergeValues({Low, Ovf}, dl);
29620}
29621
29622SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
29623 assert(Subtarget.isTargetWin64() && "Unexpected target");
29624 EVT VT = Op.getValueType();
29625 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29626 "Unexpected return type for lowering");
29627
29628 if (isa<ConstantSDNode>(Op->getOperand(1))) {
29629     SmallVector<SDValue> Result;
29630 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
29631 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
29632 }
29633
29634 RTLIB::Libcall LC;
29635 bool isSigned;
29636 switch (Op->getOpcode()) {
29637 // clang-format off
29638 default: llvm_unreachable("Unexpected request for libcall!");
29639 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
29640 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
29641 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
29642 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
29643 // clang-format on
29644 }
29645
29646 SDLoc dl(Op);
29647 SDValue InChain = DAG.getEntryNode();
29648
29649   TargetLowering::ArgListTy Args;
29650   TargetLowering::ArgListEntry Entry;
29651 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
29652 EVT ArgVT = Op->getOperand(i).getValueType();
29653 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29654 "Unexpected argument type for lowering");
29655 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29656 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29657 MachinePointerInfo MPI =
29658         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29659 Entry.Node = StackPtr;
29660 InChain =
29661 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
29662 Entry.Ty = PointerType::get(*DAG.getContext(), 0);
29663 Entry.IsSExt = false;
29664 Entry.IsZExt = false;
29665 Args.push_back(Entry);
29666 }
29667
29668   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
29669                                          getPointerTy(DAG.getDataLayout()));
29670
29671   TargetLowering::CallLoweringInfo CLI(DAG);
29672 CLI.setDebugLoc(dl)
29673 .setChain(InChain)
29674 .setLibCallee(
29675             getLibcallCallingConv(LC),
29676 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
29677 std::move(Args))
29678 .setInRegister()
29679 .setSExtResult(isSigned)
29680 .setZExtResult(!isSigned);
29681
29682 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
29683 return DAG.getBitcast(VT, CallInfo.first);
29684}
29685
29686SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
29687 SelectionDAG &DAG,
29688 SDValue &Chain) const {
29689 assert(Subtarget.isTargetWin64() && "Unexpected target");
29690 EVT VT = Op.getValueType();
29691 bool IsStrict = Op->isStrictFPOpcode();
29692
29693 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29694 EVT ArgVT = Arg.getValueType();
29695
29696 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
29697 "Unexpected return type for lowering");
29698
29699 RTLIB::Libcall LC;
29700 if (Op->getOpcode() == ISD::FP_TO_SINT ||
29701 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
29702 LC = RTLIB::getFPTOSINT(ArgVT, VT);
29703 else
29704 LC = RTLIB::getFPTOUINT(ArgVT, VT);
29705 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29706
29707 SDLoc dl(Op);
29708 MakeLibCallOptions CallOptions;
29709 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29710
29711   SDValue Result;
29712 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
29713 // expected VT (i128).
29714 std::tie(Result, Chain) =
29715 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
29716 Result = DAG.getBitcast(VT, Result);
29717 return Result;
29718}
29719
29720SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
29721 SelectionDAG &DAG) const {
29722 assert(Subtarget.isTargetWin64() && "Unexpected target");
29723 EVT VT = Op.getValueType();
29724 bool IsStrict = Op->isStrictFPOpcode();
29725
29726 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
29727 EVT ArgVT = Arg.getValueType();
29728
29729 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29730 "Unexpected argument type for lowering");
29731
29732 RTLIB::Libcall LC;
29733 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29734 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29735 LC = RTLIB::getSINTTOFP(ArgVT, VT);
29736 else
29737 LC = RTLIB::getUINTTOFP(ArgVT, VT);
29738 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29739
29740 SDLoc dl(Op);
29741 MakeLibCallOptions CallOptions;
29742 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29743
29744 // Pass the i128 argument as an indirect argument on the stack.
29745 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29746 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29747 MachinePointerInfo MPI =
29748       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29749 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29750
29751   SDValue Result;
29752 std::tie(Result, Chain) =
29753 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29754 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29755}
29756
29757// Return true if the required (according to Opcode) shift-imm form is natively
29758// supported by the Subtarget
29759static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
29760 unsigned Opcode) {
29761 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29762 "Unexpected shift opcode");
29763
29764 if (!VT.isSimple())
29765 return false;
29766
29767 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29768 return false;
29769
29770 if (VT.getScalarSizeInBits() < 16)
29771 return false;
29772
29773 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29774 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29775 return true;
29776
29777 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29778 (VT.is256BitVector() && Subtarget.hasInt256());
29779
29780 bool AShift = LShift && (Subtarget.hasAVX512() ||
29781 (VT != MVT::v2i64 && VT != MVT::v4i64));
29782 return (Opcode == ISD::SRA) ? AShift : LShift;
29783}
29784
29785// The shift amount is a variable, but it is the same for all vector lanes.
29786// These instructions are defined together with shift-immediate.
29787static
29788 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
29789 unsigned Opcode) {
29790 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29791}
29792
29793// Return true if the required (according to Opcode) variable-shift form is
29794// natively supported by the Subtarget
29795static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
29796 unsigned Opcode) {
29797 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29798 "Unexpected shift opcode");
29799
29800 if (!VT.isSimple())
29801 return false;
29802
29803 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29804 return false;
29805
29806 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29807 return false;
29808
29809 // vXi16 supported only on AVX-512, BWI
29810 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29811 return false;
29812
29813 if (Subtarget.hasAVX512() &&
29814 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29815 return true;
29816
29817 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29818 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29819 return (Opcode == ISD::SRA) ? AShift : LShift;
29820}
29821
29822 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29823 const X86Subtarget &Subtarget) {
29824 MVT VT = Op.getSimpleValueType();
29825 SDLoc dl(Op);
29826 SDValue R = Op.getOperand(0);
29827 SDValue Amt = Op.getOperand(1);
29828 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29829 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29830
29831 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29832 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29833 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29834 SDValue Ex = DAG.getBitcast(ExVT, R);
29835
29836 // ashr(R, 63) === cmp_slt(R, 0)
29837 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29838 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29839 "Unsupported PCMPGT op");
29840 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29841 }
29842
29843 if (ShiftAmt >= 32) {
29844 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29845 SDValue Upper =
29846 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29847       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29848 ShiftAmt - 32, DAG);
29849 if (VT == MVT::v2i64)
29850 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29851 if (VT == MVT::v4i64)
29852 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29853 {9, 1, 11, 3, 13, 5, 15, 7});
29854 } else {
29855 // SRA upper i32, SRL whole i64 and select lower i32.
29856       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29857 ShiftAmt, DAG);
29858 SDValue Lower =
29859 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29860 Lower = DAG.getBitcast(ExVT, Lower);
29861 if (VT == MVT::v2i64)
29862 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29863 if (VT == MVT::v4i64)
29864 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29865 {8, 1, 10, 3, 12, 5, 14, 7});
29866 }
29867 return DAG.getBitcast(VT, Ex);
29868 };
29869
29870 // Optimize shl/srl/sra with constant shift amount.
29871 APInt APIntShiftAmt;
29872 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29873 return SDValue();
29874
29875 // If the shift amount is out of range, return undef.
29876 if (APIntShiftAmt.uge(EltSizeInBits))
29877 return DAG.getUNDEF(VT);
29878
29879 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29880
29881 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
29882     // Hardware support for vector shifts is sparse, which makes us scalarize the
29883     // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
29884 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29885 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29886 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29887 // must be 0). (add undef, undef) however can be any value. To make this
29888 // safe, we must freeze R to ensure that register allocation uses the same
29889 // register for an undefined value. This ensures that the result will
29890 // still be even and preserves the original semantics.
29891 R = DAG.getFreeze(R);
29892 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29893 }
29894
29895 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29896 }
29897
29898 // i64 SRA needs to be performed as partial shifts.
29899 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29900 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29901 Op.getOpcode() == ISD::SRA)
29902 return ArithmeticShiftRight64(ShiftAmt);
29903
29904 // If we're logical shifting an all-signbits value then we can just perform as
29905 // a mask.
29906 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
29907 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
29908 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
29909 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
29910 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
29911 }
29912
29913 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29914 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29915 unsigned NumElts = VT.getVectorNumElements();
29916 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29917
29918 // Simple i8 add case
29919 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29920 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29921 // must be 0). (add undef, undef) however can be any value. To make this
29922 // safe, we must freeze R to ensure that register allocation uses the same
29923 // register for an undefined value. This ensures that the result will
29924 // still be even and preserves the original semantics.
29925 R = DAG.getFreeze(R);
29926 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29927 }
29928
29929 // ashr(R, 7) === cmp_slt(R, 0)
29930 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29931 SDValue Zeros = DAG.getConstant(0, dl, VT);
29932 if (VT.is512BitVector()) {
29933 assert(VT == MVT::v64i8 && "Unexpected element type!");
29934 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29935 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29936 }
29937 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29938 }
29939
29940 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29941 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29942 return SDValue();
29943
29944 if (Subtarget.hasGFNI()) {
29945 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
29946 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
29947 DAG.getTargetConstant(0, dl, MVT::i8));
29948 }
29949
29950 if (Op.getOpcode() == ISD::SHL) {
29951 // Make a large shift.
29952 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29953 ShiftAmt, DAG);
29954 SHL = DAG.getBitcast(VT, SHL);
29955 // Zero out the rightmost bits.
29956 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29957 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29958 }
29959 if (Op.getOpcode() == ISD::SRL) {
29960 // Make a large shift.
29961 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29962 ShiftAmt, DAG);
29963 SRL = DAG.getBitcast(VT, SRL);
29964 // Zero out the leftmost bits.
29965 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29966 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29967 }
29968 if (Op.getOpcode() == ISD::SRA) {
29969 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
29970 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29971
29972 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29973 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29974 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29975 return Res;
29976 }
29977 llvm_unreachable("Unknown shift opcode.");
29978 }
29979
29980 return SDValue();
29981}
29982
29983 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29984 const X86Subtarget &Subtarget) {
29985 MVT VT = Op.getSimpleValueType();
29986 SDLoc dl(Op);
29987 SDValue R = Op.getOperand(0);
29988 SDValue Amt = Op.getOperand(1);
29989 unsigned Opcode = Op.getOpcode();
29990 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29991
29992 int BaseShAmtIdx = -1;
29993 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29994 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29995 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29996 Subtarget, DAG);
29997
29998 // vXi8 shifts - shift as v8i16 + mask result.
29999 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30000 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30001 VT == MVT::v64i8) &&
30002 !Subtarget.hasXOP()) {
30003 unsigned NumElts = VT.getVectorNumElements();
30004 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30005 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30006 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30007 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30008
30009 // Create the mask using vXi16 shifts. For shift-rights we need to move
30010 // the upper byte down before splatting the vXi8 mask.
30011 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30012 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30013 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30014 if (Opcode != ISD::SHL)
30015 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30016 8, DAG);
30017 BitMask = DAG.getBitcast(VT, BitMask);
30018 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30019 SmallVector<int, 64>(NumElts, 0));
30020
30021 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30022 DAG.getBitcast(ExtVT, R), BaseShAmt,
30023 BaseShAmtIdx, Subtarget, DAG);
30024 Res = DAG.getBitcast(VT, Res);
30025 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30026
30027 if (Opcode == ISD::SRA) {
30028 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30029 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30030 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30031 SignMask =
30032 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30033 BaseShAmtIdx, Subtarget, DAG);
30034 SignMask = DAG.getBitcast(VT, SignMask);
30035 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30036 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30037 }
30038 return Res;
30039 }
30040 }
30041 }
30042
30043 return SDValue();
30044}
30045
30046// Convert a shift/rotate left amount to a multiplication scale factor.
30047 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30048 const X86Subtarget &Subtarget,
30049 SelectionDAG &DAG) {
30050 MVT VT = Amt.getSimpleValueType();
30051 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30052 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30053 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30054 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30055 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30056 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30057 return SDValue();
30058
30059 MVT SVT = VT.getVectorElementType();
30060 unsigned SVTBits = SVT.getSizeInBits();
30061 unsigned NumElems = VT.getVectorNumElements();
30062
30063 APInt UndefElts;
30064 SmallVector<APInt> EltBits;
30065 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30066 APInt One(SVTBits, 1);
30067 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30068 for (unsigned I = 0; I != NumElems; ++I) {
30069 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30070 continue;
30071 uint64_t ShAmt = EltBits[I].getZExtValue();
30072 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30073 }
30074 return DAG.getBuildVector(VT, dl, Elts);
30075 }
30076
30077 // If the target doesn't support variable shifts, use either FP conversion
30078 // or integer multiplication to avoid shifting each element individually.
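  //
  // For v4i32 the trick below constructs the float 2^Amt directly: shifting
  // Amt into the exponent field (Amt << 23) and adding the bias 0x3f800000
  // (the encoding of 1.0f) yields the IEEE-754 encoding of 2^Amt, which
  // FP_TO_SINT then turns back into an integer scale factor.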
30079 if (VT == MVT::v4i32) {
30080 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30081 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30082 DAG.getConstant(0x3f800000U, dl, VT));
30083 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30084 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30085 }
30086
30087 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30088 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30089 SDValue Z = DAG.getConstant(0, dl, VT);
30090 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30091 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30092 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30093 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30094 if (Subtarget.hasSSE41())
30095 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30096 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30097 }
30098
30099 return SDValue();
30100}
30101
30102static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30103 SelectionDAG &DAG) {
30104 MVT VT = Op.getSimpleValueType();
30105 SDLoc dl(Op);
30106 SDValue R = Op.getOperand(0);
30107 SDValue Amt = Op.getOperand(1);
30108 unsigned NumElts = VT.getVectorNumElements();
30109 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30110 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30111
30112 unsigned Opc = Op.getOpcode();
30113 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30114 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30115
30116 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30117 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30118
30119 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30120 return V;
30121
30122 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30123 return V;
30124
30125 if (supportedVectorVarShift(VT, Subtarget, Opc))
30126 return Op;
30127
30128 // i64 vector arithmetic shift can be emulated with the transform:
30129 // M = lshr(SIGN_MASK, Amt)
30130 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30131 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30132 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30133 Opc == ISD::SRA) {
30134 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30135 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30136 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30137 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30138 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30139 return R;
30140 }
30141
30142 // XOP has 128-bit variable logical/arithmetic shifts.
30143 // +ve/-ve Amt = shift left/right.
30144 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30145 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30146 if (Opc == ISD::SRL || Opc == ISD::SRA)
30147 Amt = DAG.getNegative(Amt, dl, VT);
30148 if (Opc == ISD::SHL || Opc == ISD::SRL)
30149 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30150 if (Opc == ISD::SRA)
30151 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30152 }
30153
30154 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30155 // shifts per-lane and then shuffle the partial results back together.
30156 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30157 // Splat the shift amounts so the scalar shifts above will catch it.
30158 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30159 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30160 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30161 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30162 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30163 }
30164
30165   // Build a map of in-range constant amounts with element masks of where they occur.
30166   SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30167 if (ConstantAmt) {
30168 for (unsigned I = 0; I != NumElts; ++I) {
30169 SDValue A = Amt.getOperand(I);
30170 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30171 continue;
30172 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30173 if (UniqueCstAmt.count(CstAmt)) {
30174 UniqueCstAmt[CstAmt].setBit(I);
30175 continue;
30176 }
30177 UniqueCstAmt[CstAmt] = APInt::getOneBitSet(NumElts, I);
30178 }
30179 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30180 }
30181
30182 // If possible, lower this shift as a sequence of two shifts by
30183 // constant plus a BLENDing shuffle instead of scalarizing it.
30184 // Example:
30185 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30186 //
30187 // Could be rewritten as:
30188 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30189 //
30190 // The advantage is that the two shifts from the example would be
30191 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30192 if (UniqueCstAmt.size() == 2 &&
30193 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30194 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30195 unsigned AmtA = UniqueCstAmt.begin()->first;
30196 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30197 const APInt &MaskA = UniqueCstAmt.begin()->second;
30198 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30199 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30200 for (unsigned I = 0; I != NumElts; ++I) {
30201 if (MaskA[I])
30202 ShuffleMask[I] = I;
30203 if (MaskB[I])
30204 ShuffleMask[I] = I + NumElts;
30205 }
30206
30207 // Only perform this blend if we can perform it without loading a mask.
30208 if ((VT != MVT::v16i16 ||
30209 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30210 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30211 canWidenShuffleElements(ShuffleMask))) {
30212 SDValue Shift1 =
30213 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30214 SDValue Shift2 =
30215 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30216 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30217 }
30218 }
30219
30220 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30221 // using vYiM vector operations where X*N == Y*M and M > N.
30222 if (ConstantAmt &&
30223 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30224 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30225 !Subtarget.hasXOP()) {
30226 MVT NarrowScalarVT = VT.getScalarType();
30227 // We can do this extra fast if each pair of narrow elements is shifted by
30228 // the same amount by doing this SWAR style: use a shift to move the valid
30229     // bits to the right position, then mask out any bits which crossed from one
30230 // element to the other.
30231 // This optimized lowering is only valid if the elements in a pair can
30232 // be treated identically.
30233 SmallVector<SDValue, 32> AmtWideElts(Amt->op_begin(), Amt->op_end());
30234 SmallVector<SDValue, 32> TmpAmtWideElts;
30235 int WideEltSizeInBits = EltSizeInBits;
30236 while (WideEltSizeInBits < 32) {
30237 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30238 // unprofitable.
30239 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30240 break;
30241 }
30242 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30243 bool SameShifts = true;
30244 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30245 unsigned DstI = SrcI / 2;
30246 // Both elements are undef? Make a note and keep going.
30247 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30248 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30249 continue;
30250 }
30251 // Even element is undef? We will shift it by the same shift amount as
30252 // the odd element.
30253 if (AmtWideElts[SrcI].isUndef()) {
30254 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30255 continue;
30256 }
30257 // Odd element is undef? We will shift it by the same shift amount as
30258 // the even element.
30259 if (AmtWideElts[SrcI + 1].isUndef()) {
30260 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30261 continue;
30262 }
30263 // Both elements are equal.
30264 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30265 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30266 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30267 continue;
30268 }
30269 // One of the provisional wide elements will not have the same shift
30270 // amount. Let's bail.
30271 SameShifts = false;
30272 break;
30273 }
30274 if (!SameShifts) {
30275 break;
30276 }
30277 WideEltSizeInBits *= 2;
30278 std::swap(TmpAmtWideElts, AmtWideElts);
30279 }
30280 APInt APIntShiftAmt;
30281 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30282 bool Profitable = WidenShift;
30283 // AVX512BW brings support for vpsllvw.
30284 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30285 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30286 Profitable = false;
30287 }
30288 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30289 // fairly cheaply in other ways.
30290 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30291 Profitable = false;
30292 }
30293 // Leave it up to GFNI if we have it around.
30294 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30295 // is probably a win to use other strategies in some cases.
30296 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30297 Profitable = false;
30298 }
30299
30300 // AVX1 does not have vpand which makes our masking impractical. It does
30301 // have vandps but that is an FP instruction and crossing FP<->int typically
30302 // has some cost.
30303 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30304 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30305 Profitable = false;
30306 }
30307 unsigned WideNumElts = AmtWideElts.size();
30308 // We are only dealing with identical pairs.
30309 if (Profitable && WideNumElts != NumElts) {
30310 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30311 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30312 // Cast the operand to vXiM.
30313 SDValue RWide = DAG.getBitcast(WideVT, R);
30314 // Create our new vector of shift amounts.
30315 SDValue AmtWide = DAG.getBuildVector(
30316 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30317 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30318 // Perform the actual shift.
30319 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30320 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30321 // Now we need to construct a mask which will "drop" bits that get
30322 // shifted past the LSB/MSB. For a logical shift left, it will look
30323 // like:
30324 // FullMask = (1 << EltSizeInBits) - 1
30325 // Mask = FullMask << Amt
30326 //
30327 // This masking ensures that bits cannot migrate from one narrow lane to
30328 // another. The construction of this mask will be constant folded.
30329 // The mask for a logical right shift is nearly identical, the only
30330 // difference is that the all ones mask is shifted right instead of left.
30331 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30332 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30333 Mask = DAG.getBitcast(WideVT, Mask);
30334 // Finally, we mask the shifted vector with the SWAR mask.
30335 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30336 Masked = DAG.getBitcast(VT, Masked);
30337 if (Opc != ISD::SRA) {
30338 // Logical shifts are complete at this point.
30339 return Masked;
30340 }
30341 // At this point, we have done a *logical* shift right. We now need to
30342 // sign extend the result so that we get behavior equivalent to an
30343 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30344 // are `EltSizeInBits-AmtWide` bits wide.
30345 //
30346 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30347 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30348 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30349 // can use the following trick to accomplish this:
30350 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30351 // (Masked ^ SignBitMask) - SignBitMask
30352 //
30353 // When the sign bit is already clear, this will compute:
30354 // Masked + SignBitMask - SignBitMask
30355 //
30356 // This is equal to Masked which is what we want: the sign bit was clear
30357 // so sign extending should be a no-op.
30358 //
30359 // When the sign bit is set, this will compute:
30360 // Masked - SignBitmask - SignBitMask
30361 //
30362 // This is equal to Masked - 2*SignBitMask which will correctly sign
30363 // extend our result.
30364 SDValue SplatHighBit =
30365 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30366 // This does not induce recursion, all operands are constants.
30367 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30368 SDValue FlippedSignBit =
30369 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30370 SDValue Subtraction =
30371 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30372 return Subtraction;
30373 }
30374 }
30375
30376 // If possible, lower this packed shift into a vector multiply instead of
30377 // expanding it into a sequence of scalar shifts.
30378 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30379 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30380 Subtarget.canExtendTo512BW())))
30381 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30382 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30383
30384 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30385 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30386 if (Opc == ISD::SRL && ConstantAmt &&
30387 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30388 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30389 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30390 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30391 SDValue Zero = DAG.getConstant(0, dl, VT);
30392 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30393 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30394 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30395 }
30396 }
30397
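// ----------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the identity behind the MULHU
// replacement above, on a scalar i16. For 1 <= Amt <= 15, x >> Amt equals the
// high 16 bits of x * 2^(16 - Amt); the Amt == 0 lanes are handled by the
// SETEQ/select in the code above. Helper names are invented for illustration.
#include <cassert>
#include <cstdint>

static uint16_t srl16ViaMulhu(uint16_t X, unsigned Amt) {
  uint16_t Scale = (uint16_t)(1u << (16 - Amt));  // scale factor, Amt in [1, 15]
  return (uint16_t)(((uint32_t)X * Scale) >> 16); // MULHU
}

static void checkSrl16ViaMulhu() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X)
    for (unsigned Amt = 1; Amt != 16; ++Amt)
      assert(srl16ViaMulhu((uint16_t)X, Amt) == (uint16_t)(X >> Amt));
}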
30398 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30399 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30400 // TODO: Special case handling for shift by 0/1, really we can afford either
30401 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30402 if (Opc == ISD::SRA && ConstantAmt &&
30403 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30404 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30405 !Subtarget.hasAVX512()) ||
30406 DAG.isKnownNeverZero(Amt))) {
30407 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30408 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30409 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30410 SDValue Amt0 =
30411 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30412 SDValue Amt1 =
30413 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30414 SDValue Sra1 =
30415 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30416 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30417 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30418 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30419 }
30420 }
30421
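// ----------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the signed counterpart used by
// the MULHS replacement above. Mathematically, for 1 <= Amt <= 15 the high 16
// bits of x * 2^(16 - Amt) (signed multiply) equal the arithmetic x >> Amt.
// The lowering still special-cases Amt == 0 (the scale 2^16 does not fit in
// i16 at all) and Amt == 1 (the scale 2^15 wraps to a negative i16 immediate
// under MULHS), which is what the two selects above handle.
#include <cassert>
#include <cstdint>

static int16_t sra16ViaMulhs(int16_t X, unsigned Amt) {
  int32_t Scale = 1 << (16 - Amt);              // Amt in [1, 15]
  return (int16_t)(((int32_t)X * Scale) >> 16); // MULHS
}

static void checkSra16ViaMulhs() {
  for (int X = -32768; X != 32768; ++X)
    for (unsigned Amt = 1; Amt != 16; ++Amt)
      assert(sra16ViaMulhs((int16_t)X, Amt) == (int16_t)(X >> Amt));
}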
30422 // v4i32 Non Uniform Shifts.
30423 // If the shift amount is constant we can shift each lane using the SSE2
30424 // immediate shifts, else we need to zero-extend each lane to the lower i64
30425 // and shift using the SSE2 variable shifts.
30426 // The separate results can then be blended together.
30427 if (VT == MVT::v4i32) {
30428 SDValue Amt0, Amt1, Amt2, Amt3;
30429 if (ConstantAmt) {
30430 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30431 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30432 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30433 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30434 } else {
30435 // The SSE2 shifts use the lower i64 as the same shift amount for
30436 // all lanes and the upper i64 is ignored. On AVX we're better off
30437 // just zero-extending, but for SSE just duplicating the top 16-bits is
30438 // cheaper and has the same effect for out of range values.
30439 if (Subtarget.hasAVX()) {
30440 SDValue Z = DAG.getConstant(0, dl, VT);
30441 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30442 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30443 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30444 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30445 } else {
30446 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30447 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30448 {4, 5, 6, 7, -1, -1, -1, -1});
30449 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30450 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30451 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30452 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30453 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30454 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30455 }
30456 }
30457
30458 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30459 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30460 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30461 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30462 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30463
30464 // Merge the shifted lane results optimally with/without PBLENDW.
30465 // TODO - ideally shuffle combining would handle this.
30466 if (Subtarget.hasSSE41()) {
30467 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30468 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30469 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30470 }
30471 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30472 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30473 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30474 }
30475
30476 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30477 // look up the pre-computed shift values.
30478 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30479 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30480 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30481 unsigned NumLanes = VT.getSizeInBits() / 128u;
30482 unsigned NumEltsPerLane = NumElts / NumLanes;
30483 SmallVector<APInt, 64> LUT;
30484 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30485 unsigned LoElt = Lane * NumEltsPerLane;
30486 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30487 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30488 if (!KnownLane.isConstant())
30489 break;
30490 const APInt &LaneSplat = KnownLane.getConstant();
30491 for (unsigned I = 0; I != 8; ++I) {
30492 if (Opc == ISD::SHL)
30493 LUT.push_back(LaneSplat.shl(I));
30494 else if (Opc == ISD::SRL)
30495 LUT.push_back(LaneSplat.lshr(I));
30496 else if (Opc == ISD::SRA)
30497 LUT.push_back(LaneSplat.ashr(I));
30498 }
30499 LUT.append(8, APInt::getZero(8));
30500 }
30501 if (LUT.size() == NumElts) {
30502 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30503 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30504 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30505 }
30506 }
30507
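// ----------------------------------------------------------------------------
// Illustrative sketch (not part of this file): a scalar model of the PSHUFB
// lookup above. When every source byte of a lane is the same known constant,
// the eight possible shift results can be precomputed and the per-byte shift
// amount simply selects one of them, which is exactly what PSHUFB does with
// the amount vector as its index operand. Names are invented for illustration.
#include <cstdint>

static uint8_t shlKnownSplatViaLut(uint8_t Splat, uint8_t Amt) {
  uint8_t LUT[16] = {};
  for (unsigned I = 0; I != 8; ++I)
    LUT[I] = (uint8_t)(Splat << I); // precomputed "Splat << 0..7"
  // Entries 8..15 correspond to out-of-range shift amounts; the real lowering
  // marks them undef, here they are simply left as zero.
  return LUT[Amt & 0x0F]; // PSHUFB indexes with the low four bits
}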
30508 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30509 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30510 // make the existing SSE solution better.
30511 // NOTE: We honor preferred vector width before promoting to 512-bits.
30512 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30513 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30514 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30515 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30516 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30517 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30518 "Unexpected vector type");
30519 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30520 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30521 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30522 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30523 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30524 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30525 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30526 }
30527
30528 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30529 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30530 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30531 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30532 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30533 !Subtarget.hasXOP()) {
30534 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30535 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30536
30537 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30538 // isn't legal).
30539 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30540 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30541 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30542 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30543 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
30544 "Constant build vector expected");
30545
30546 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30547 bool IsSigned = Opc == ISD::SRA;
30548 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30549 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30550 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30551 return DAG.getZExtOrTrunc(R, dl, VT);
30552 }
30553
30554 SmallVector<SDValue, 16> LoAmt, HiAmt;
30555 for (unsigned i = 0; i != NumElts; i += 16) {
30556 for (int j = 0; j != 8; ++j) {
30557 LoAmt.push_back(Amt.getOperand(i + j));
30558 HiAmt.push_back(Amt.getOperand(i + j + 8));
30559 }
30560 }
30561
30562 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30563 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30564
30565 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30566 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30567 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30568 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30569 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30570 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30571 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30572 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30573 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30574 }
30575
30576 if (VT == MVT::v16i8 ||
30577 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30578 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30579 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30580
30581 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30582 if (VT.is512BitVector()) {
30583 // On AVX512BW targets we make use of the fact that VSELECT lowers
30584 // to a masked blend which selects bytes based just on the sign bit
30585 // extracted to a mask.
30586 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30587 V0 = DAG.getBitcast(VT, V0);
30588 V1 = DAG.getBitcast(VT, V1);
30589 Sel = DAG.getBitcast(VT, Sel);
30590 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
30591 ISD::SETGT);
30592 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
30593 } else if (Subtarget.hasSSE41()) {
30594 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30595 // on the sign bit.
30596 V0 = DAG.getBitcast(VT, V0);
30597 V1 = DAG.getBitcast(VT, V1);
30598 Sel = DAG.getBitcast(VT, Sel);
30599 return DAG.getBitcast(SelVT,
30600 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
30601 }
30602 // On pre-SSE41 targets we test for the sign bit by comparing to
30603 // zero - a negative value will set all bits of the lanes to true
30604 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30605 SDValue Z = DAG.getConstant(0, dl, SelVT);
30606 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
30607 return DAG.getSelect(dl, SelVT, C, V0, V1);
30608 };
30609
30610 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30611 // We can safely do this using i16 shifts as we're only interested in
30612 // the 3 lower bits of each byte.
30613 Amt = DAG.getBitcast(ExtVT, Amt);
30614 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
30615 Amt = DAG.getBitcast(VT, Amt);
30616
30617 if (Opc == ISD::SHL || Opc == ISD::SRL) {
30618 // r = VSELECT(r, shift(r, 4), a);
30619 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
30620 R = SignBitSelect(VT, Amt, M, R);
30621
30622 // a += a
30623 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30624
30625 // r = VSELECT(r, shift(r, 2), a);
30626 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
30627 R = SignBitSelect(VT, Amt, M, R);
30628
30629 // a += a
30630 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30631
30632 // return VSELECT(r, shift(r, 1), a);
30633 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
30634 R = SignBitSelect(VT, Amt, M, R);
30635 return R;
30636 }
30637
30638 if (Opc == ISD::SRA) {
30639 // For SRA we need to unpack each byte to the higher byte of a i16 vector
30640 // so we can correctly sign extend. We don't care what happens to the
30641 // lower byte.
30642 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30643 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30644 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
30645 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
30646 ALo = DAG.getBitcast(ExtVT, ALo);
30647 AHi = DAG.getBitcast(ExtVT, AHi);
30648 RLo = DAG.getBitcast(ExtVT, RLo);
30649 RHi = DAG.getBitcast(ExtVT, RHi);
30650
30651 // r = VSELECT(r, shift(r, 4), a);
30652 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
30653 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
30654 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30655 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30656
30657 // a += a
30658 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30659 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30660
30661 // r = VSELECT(r, shift(r, 2), a);
30662 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
30663 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
30664 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30665 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30666
30667 // a += a
30668 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30669 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30670
30671 // r = VSELECT(r, shift(r, 1), a);
30672 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
30673 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
30674 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30675 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30676
30677 // Logical shift the result back to the lower byte, leaving a zero upper
30678 // byte meaning that we can safely pack with PACKUSWB.
30679 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
30680 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
30681 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
30682 }
30683 }
30684
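// ----------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the shift ladder used above, on
// a scalar i8. Each of the three low amount bits conditionally applies a shift
// of 4, 2 or 1; in the vector code every "if" becomes a byte blend keyed on
// the sign bit of the (left-shifted) amount.
#include <cassert>
#include <cstdint>

static uint8_t shl8ViaLadder(uint8_t R, uint8_t Amt) {
  if (Amt & 4) R = (uint8_t)(R << 4);
  if (Amt & 2) R = (uint8_t)(R << 2);
  if (Amt & 1) R = (uint8_t)(R << 1);
  return R; // equals R << (Amt & 7)
}

static void checkShl8ViaLadder() {
  for (unsigned R = 0; R != 256; ++R)
    for (unsigned Amt = 0; Amt != 8; ++Amt)
      assert(shl8ViaLadder((uint8_t)R, (uint8_t)Amt) == (uint8_t)(R << Amt));
}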
30685 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
30686 MVT ExtVT = MVT::v8i32;
30687 SDValue Z = DAG.getConstant(0, dl, VT);
30688 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
30689 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
30690 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
30691 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
30692 ALo = DAG.getBitcast(ExtVT, ALo);
30693 AHi = DAG.getBitcast(ExtVT, AHi);
30694 RLo = DAG.getBitcast(ExtVT, RLo);
30695 RHi = DAG.getBitcast(ExtVT, RHi);
30696 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
30697 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
30698 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
30699 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
30700 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30701 }
30702
30703 if (VT == MVT::v8i16) {
30704 // If we have a constant shift amount, the non-SSE41 path is best as
30705 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
30706 bool UseSSE41 = Subtarget.hasSSE41() &&
30707 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30708
30709 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
30710 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
30711 // the sign bit.
30712 if (UseSSE41) {
30713 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
30714 V0 = DAG.getBitcast(ExtVT, V0);
30715 V1 = DAG.getBitcast(ExtVT, V1);
30716 Sel = DAG.getBitcast(ExtVT, Sel);
30717 return DAG.getBitcast(
30718 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
30719 }
30720 // On pre-SSE41 targets we splat the sign bit - a negative value will
30721 // set all bits of the lanes to true and VSELECT uses that in
30722 // its OR(AND(V0,C),AND(V1,~C)) lowering.
30723 SDValue C =
30724 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
30725 return DAG.getSelect(dl, VT, C, V0, V1);
30726 };
30727
30728 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
30729 if (UseSSE41) {
30730 // On SSE41 targets we need to replicate the shift mask in both
30731 // bytes for PBLENDVB.
30732 Amt = DAG.getNode(
30733 ISD::OR, dl, VT,
30734 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
30735 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
30736 } else {
30737 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
30738 }
30739
30740 // r = VSELECT(r, shift(r, 8), a);
30741 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
30742 R = SignBitSelect(Amt, M, R);
30743
30744 // a += a
30745 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30746
30747 // r = VSELECT(r, shift(r, 4), a);
30748 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
30749 R = SignBitSelect(Amt, M, R);
30750
30751 // a += a
30752 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30753
30754 // r = VSELECT(r, shift(r, 2), a);
30755 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
30756 R = SignBitSelect(Amt, M, R);
30757
30758 // a += a
30759 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30760
30761 // return VSELECT(r, shift(r, 1), a);
30762 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
30763 R = SignBitSelect(Amt, M, R);
30764 return R;
30765 }
30766
30767 // Decompose 256-bit shifts into 128-bit shifts.
30768 if (VT.is256BitVector())
30769 return splitVectorIntBinary(Op, DAG, dl);
30770
30771 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30772 return splitVectorIntBinary(Op, DAG, dl);
30773
30774 return SDValue();
30775}
30776
30777static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
30778 SelectionDAG &DAG) {
30779 MVT VT = Op.getSimpleValueType();
30780 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
30781 "Unexpected funnel shift opcode!");
30782
30783 SDLoc DL(Op);
30784 SDValue Op0 = Op.getOperand(0);
30785 SDValue Op1 = Op.getOperand(1);
30786 SDValue Amt = Op.getOperand(2);
30787 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30788 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
30789
30790 if (VT.isVector()) {
30791 APInt APIntShiftAmt;
30792 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30793 unsigned NumElts = VT.getVectorNumElements();
30794
30795 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
30796 if (IsFSHR)
30797 std::swap(Op0, Op1);
30798
30799 if (IsCstSplat) {
30800 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30801 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
30802 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
30803 {Op0, Op1, Imm}, DAG, Subtarget);
30804 }
30805 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
30806 {Op0, Op1, Amt}, DAG, Subtarget);
30807 }
30808 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30809 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
30810 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
30811 "Unexpected funnel shift type!");
30812
30813 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
30814 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
30815 if (IsCstSplat) {
30816 // TODO: Can't use generic expansion as UNDEF amt elements can be
30817 // converted to other values when folded to shift amounts, losing the
30818 // splat.
30819 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
30820 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
30821 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
30822 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
30823 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30824
30825 if (EltSizeInBits == 8 &&
30826 (Subtarget.hasXOP() ||
30827 (useVPTERNLOG(Subtarget, VT) &&
30828 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
30829 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
30830 // bit-select - lower using vXi16 shifts and then perform the bitmask at
30831 // the original vector width to handle cases where we split.
30832 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
30833 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
30834 SDValue ShX =
30835 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
30836 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
30837 SDValue ShY =
30838 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
30839 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
30840 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
30841 DAG.getConstant(MaskX, DL, VT));
30842 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
30843 DAG.getConstant(MaskY, DL, VT));
30844 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30845 }
30846
30847 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
30848 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
30849 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
30850 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
30851 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
30852 }
30853
30854 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30855 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30856 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
30857
30858 // Constant vXi16 funnel shifts can be efficiently handled by default.
30859 if (IsCst && EltSizeInBits == 16)
30860 return SDValue();
30861
30862 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
30863 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30864 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30865
30866 // Split 256-bit integers on XOP/pre-AVX2 targets.
30867 // Split 512-bit integers on non 512-bit BWI targets.
30868 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
30869 !Subtarget.hasAVX2())) ||
30870 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
30871 EltSizeInBits < 32)) {
30872 // Pre-mask the amount modulo using the wider vector.
30873 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
30874 return splitVectorOp(Op, DAG, DL);
30875 }
30876
30877 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
30878 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
30879 int ScalarAmtIdx = -1;
30880 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
30881 // Uniform vXi16 funnel shifts can be efficiently handled by default.
30882 if (EltSizeInBits == 16)
30883 return SDValue();
30884
30885 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30886 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30887 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
30888 ScalarAmtIdx, Subtarget, DAG);
30889 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
30890 ScalarAmtIdx, Subtarget, DAG);
30891 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30892 }
30893 }
30894
30895 MVT WideSVT = MVT::getIntegerVT(
30896 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
30897 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
30898
30899 // If per-element shifts are legal, fallback to generic expansion.
30900 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
30901 return SDValue();
30902
30903 // Attempt to fold as:
30904 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30905 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30906 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30907 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30908 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
30909 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
30910 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30911 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
30912 EltSizeInBits, DAG);
30913 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
30914 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
30915 if (!IsFSHR)
30916 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
30917 EltSizeInBits, DAG);
30918 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
30919 }
30920
30921 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30922 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
30923 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30924 SDValue Z = DAG.getConstant(0, DL, VT);
30925 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30926 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30927 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30928 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30929 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30930 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30931 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30932 }
30933
30934 // Fallback to generic expansion.
30935 return SDValue();
30936 }
30937 assert(
30938 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
30939 "Unexpected funnel shift type!");
30940
30941 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30942 bool OptForSize = DAG.shouldOptForSize();
30943 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30944
30945 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30946 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30947 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30948 !isa<ConstantSDNode>(Amt)) {
30949 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30950 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30951 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30952 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30953 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30954 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30955 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30956 if (IsFSHR) {
30957 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30958 } else {
30959 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30960 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30961 }
30962 return DAG.getZExtOrTrunc(Res, DL, VT);
30963 }
30964
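// ----------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the widening expansion used
// above for narrow funnel shifts, on scalar i8 halves. The two inputs are
// concatenated into one wider value, shifted by the masked amount, and the
// result byte is extracted. Helper names are invented for illustration.
#include <cstdint>

static uint8_t fshl8(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t Wide = ((uint32_t)X << 8) | Y;   // (aext(x) << bw) | zext(y)
  return (uint8_t)((Wide << (Z & 7)) >> 8); // (... << (z & (bw-1))) >> bw
}

static uint8_t fshr8(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t Wide = ((uint32_t)X << 8) | Y;
  return (uint8_t)(Wide >> (Z & 7));        // ... >> (z & (bw-1))
}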
30965 if (VT == MVT::i8 || ExpandFunnel)
30966 return SDValue();
30967
30968 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30969 if (VT == MVT::i16) {
30970 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30971 DAG.getConstant(15, DL, Amt.getValueType()));
30972 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30973 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30974 }
30975
30976 return Op;
30977}
30978
30979static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30980 SelectionDAG &DAG) {
30981 MVT VT = Op.getSimpleValueType();
30982 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30983
30984 SDLoc DL(Op);
30985 SDValue R = Op.getOperand(0);
30986 SDValue Amt = Op.getOperand(1);
30987 unsigned Opcode = Op.getOpcode();
30988 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30989 int NumElts = VT.getVectorNumElements();
30990 bool IsROTL = Opcode == ISD::ROTL;
30991
30992 // Check for constant splat rotation amount.
30993 APInt CstSplatValue;
30994 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30995
30996 // Check for splat rotate by zero.
30997 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30998 return R;
30999
31000 // AVX512 implicitly uses modulo rotation amounts.
31001 if ((Subtarget.hasVLX() ||
31002 (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
31003 32 <= EltSizeInBits) {
31004 // Attempt to rotate by immediate.
31005 if (IsCstSplat) {
31006 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31007 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31008 return DAG.getNode(RotOpc, DL, VT, R,
31009 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31010 }
31011
31012 // Else, fall-back on VPROLV/VPRORV.
31013 return Op;
31014 }
31015
31016 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31017 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31018 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31019 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31020 }
31021
31022 SDValue Z = DAG.getConstant(0, DL, VT);
31023
31024 if (!IsROTL) {
31025 // If the ISD::ROTR amount is constant, we're always better off converting to
31026 // ISD::ROTL.
31027 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31028 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31029
31030 // XOP targets always prefer ISD::ROTL.
31031 if (Subtarget.hasXOP())
31032 return DAG.getNode(ISD::ROTL, DL, VT, R,
31033 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31034 }
31035
31036 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31037 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31038 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31039 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31040 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31041 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31042 DAG.getTargetConstant(0, DL, MVT::i8));
31043 }
31044
31045 // Split 256-bit integers on XOP/pre-AVX2 targets.
31046 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31047 return splitVectorIntBinary(Op, DAG, DL);
31048
31049 // XOP has 128-bit vector variable + immediate rotates.
31050 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31051 // XOP implicitly uses modulo rotation amounts.
31052 if (Subtarget.hasXOP()) {
31053 assert(IsROTL && "Only ROTL expected");
31054 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31055
31056 // Attempt to rotate by immediate.
31057 if (IsCstSplat) {
31058 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31059 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31060 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31061 }
31062
31063 // Use general rotate by variable (per-element).
31064 return Op;
31065 }
31066
31067 // Rotate by a uniform constant - expand back to shifts.
31068 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31069 // to other values when folded to shift amounts, losing the splat.
31070 if (IsCstSplat) {
31071 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31072 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31073 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31074 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31075 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31076 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31077 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31078 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31079 }
31080
31081 // Split 512-bit integers on non 512-bit BWI targets.
31082 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31083 return splitVectorIntBinary(Op, DAG, DL);
31084
31085 assert(
31086 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31087 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31088 Subtarget.hasAVX2()) ||
31089 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31090 "Only vXi32/vXi16/vXi8 vector rotates supported");
31091
31092 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31093 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31094
31095 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31096 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31097
31098 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31099 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31100 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31101 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31102 int BaseRotAmtIdx = -1;
31103 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31104 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31105 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31106 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31107 }
31108 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31109 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31110 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31111 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31112 BaseRotAmtIdx, Subtarget, DAG);
31113 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31114 BaseRotAmtIdx, Subtarget, DAG);
31115 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31116 }
31117 }
31118
31119 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31120 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31121
31122 // Attempt to fold as unpack(x,x) << zext(y):
31123 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31124 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31125 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31126 if (!(ConstantAmt && EltSizeInBits != 8) &&
31127 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31128 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31129 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31130 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31131 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31132 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31133 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31134 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31135 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31136 }
31137
31138 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31139 // the amount bit.
31140 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31141 if (EltSizeInBits == 8) {
31142 MVT WideVT =
31143 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31144
31145 // Attempt to fold as:
31146 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31147 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31148 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31149 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31150 // If we're rotating by constant, just use default promotion.
31151 if (ConstantAmt)
31152 return SDValue();
31153 // See if we can perform this by widening to vXi16 or vXi32.
31154 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31155 R = DAG.getNode(
31156 ISD::OR, DL, WideVT, R,
31157 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31158 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31159 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31160 if (IsROTL)
31161 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31162 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31163 }
31164
31165 // We don't need ModuloAmt here as we just peek at individual bits.
31166 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31167 if (Subtarget.hasSSE41()) {
31168 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31169 // on the sign bit.
31170 V0 = DAG.getBitcast(VT, V0);
31171 V1 = DAG.getBitcast(VT, V1);
31172 Sel = DAG.getBitcast(VT, Sel);
31173 return DAG.getBitcast(SelVT,
31174 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31175 }
31176 // On pre-SSE41 targets we test for the sign bit by comparing to
31177 // zero - a negative value will set all bits of the lanes to true
31178 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31179 SDValue Z = DAG.getConstant(0, DL, SelVT);
31180 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31181 return DAG.getSelect(DL, SelVT, C, V0, V1);
31182 };
31183
31184 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31185 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31186 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31187 IsROTL = true;
31188 }
31189
31190 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31191 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31192
31193 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31194 // We can safely do this using i16 shifts as we're only interested in
31195 // the 3 lower bits of each byte.
31196 Amt = DAG.getBitcast(ExtVT, Amt);
31197 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31198 Amt = DAG.getBitcast(VT, Amt);
31199
31200 // r = VSELECT(r, rot(r, 4), a);
31201 SDValue M;
31202 M = DAG.getNode(
31203 ISD::OR, DL, VT,
31204 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31205 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31206 R = SignBitSelect(VT, Amt, M, R);
31207
31208 // a += a
31209 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31210
31211 // r = VSELECT(r, rot(r, 2), a);
31212 M = DAG.getNode(
31213 ISD::OR, DL, VT,
31214 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31215 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31216 R = SignBitSelect(VT, Amt, M, R);
31217
31218 // a += a
31219 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31220
31221 // return VSELECT(r, rot(r, 1), a);
31222 M = DAG.getNode(
31223 ISD::OR, DL, VT,
31224 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31225 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31226 return SignBitSelect(VT, Amt, M, R);
31227 }
31228
31229 bool IsSplatAmt = DAG.isSplatValue(Amt);
31230 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31231 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31232
31233 // Fallback for splats + all supported variable shifts.
31234 // Fallback for non-constants AVX2 vXi16 as well.
31235 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31236 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31237 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31238 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31239 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31240 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31241 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31242 }
31243
31244 // Everything below assumes ISD::ROTL.
31245 if (!IsROTL) {
31246 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31247 IsROTL = true;
31248 }
31249
31250 // ISD::ROT* uses modulo rotate amounts.
31251 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31252
31253 assert(IsROTL && "Only ROTL supported");
31254
31255 // As with shifts, attempt to convert the rotation amount to a multiplication
31256 // factor, fallback to general expansion.
31257 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31258 if (!Scale)
31259 return SDValue();
31260
31261 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31262 if (EltSizeInBits == 16) {
31263 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31264 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31265 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31266 }
31267
31268 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31269 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31270 // that can then be OR'd with the lower 32-bits.
31271 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31272 static const int OddMask[] = {1, 1, 3, 3};
31273 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31274 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31275
31276 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31277 DAG.getBitcast(MVT::v2i64, R),
31278 DAG.getBitcast(MVT::v2i64, Scale));
31279 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31280 DAG.getBitcast(MVT::v2i64, R13),
31281 DAG.getBitcast(MVT::v2i64, Scale13));
31282 Res02 = DAG.getBitcast(VT, Res02);
31283 Res13 = DAG.getBitcast(VT, Res13);
31284
31285 return DAG.getNode(ISD::OR, DL, VT,
31286 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31287 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31288}
31289
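// ----------------------------------------------------------------------------
// Illustrative sketch (not part of this file): why a widening multiply
// implements a rotate, as used by both the MUL/MULHU pair for vXi16 and the
// PMULUDQ path for v4i32 above. Multiplying by 2^Amt in double width leaves
// the shifted value in the low half and the wrapped-around bits in the high
// half; OR-ing the halves yields the rotate.
#include <cassert>
#include <cstdint>

static uint32_t rotl32ViaWideMul(uint32_t X, unsigned Amt) {
  uint64_t Prod = (uint64_t)X << (Amt & 31); // X * 2^Amt at double width
  return (uint32_t)Prod | (uint32_t)(Prod >> 32);
}

static void checkRotl32ViaWideMul() {
  for (unsigned Amt = 0; Amt != 32; ++Amt)
    assert(rotl32ViaWideMul(0x80000001u, Amt) ==
           (uint32_t)((0x80000001u << Amt) | (0x80000001u >> ((32 - Amt) & 31))));
}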
31290/// Returns true if the operand type is exactly twice the native width, and
31291/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31292/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31293/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31294bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31295 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31296
31297 if (OpWidth == 64)
31298 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31299 if (OpWidth == 128)
31300 return Subtarget.canUseCMPXCHG16B();
31301
31302 return false;
31303}
31304
31305TargetLoweringBase::AtomicExpansionKind
31306X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31307 Type *MemType = SI->getValueOperand()->getType();
31308
31309 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31310 !Subtarget.useSoftFloat()) {
31311 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31312 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31313 return AtomicExpansionKind::None;
31314
31315 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31316 Subtarget.hasAVX())
31317 return AtomicExpansionKind::None;
31318 }
31319
31320 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31321 : AtomicExpansionKind::None;
31322}
31323
31324// Note: this turns large loads into lock cmpxchg8b/16b.
31325TargetLoweringBase::AtomicExpansionKind
31326X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31327 Type *MemType = LI->getType();
31328
31329 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31330 !Subtarget.useSoftFloat()) {
31331 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31332 // can use movq to do the load. If we have X87 we can load into an 80-bit
31333 // X87 register and store it to a stack temporary.
31334 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31335 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31336 return AtomicExpansionKind::None;
31337
31338 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31339 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31340 Subtarget.hasAVX())
31341 return AtomicExpansionKind::None;
31342 }
31343
31344 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31345 : AtomicExpansionKind::None;
31346}
31347
31348enum BitTestKind : unsigned {
31349 ConstantBit,
31350 NotConstantBit,
31351 ShiftBit,
31352 NotShiftBit,
31353 UndefBit
31354};
31355
31356static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31357 using namespace llvm::PatternMatch;
31358 BitTestKind BTK = UndefBit;
31359 if (auto *C = dyn_cast<ConstantInt>(V)) {
31360 // Check if V is a power of 2 or NOT power of 2.
31361 if (isPowerOf2_64(C->getZExtValue()))
31362 BTK = ConstantBit;
31363 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31364 BTK = NotConstantBit;
31365 return {V, BTK};
31366 }
31367
31368 // Check if V is some power of 2 pattern known to be non-zero
31369 if (auto *I = dyn_cast<Instruction>(V)) {
31370 bool Not = false;
31371 // Check if we have a NOT
31372 Value *PeekI;
31373 if (match(I, m_Not(m_Value(PeekI))) ||
31374 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31375 Not = true;
31376 I = dyn_cast<Instruction>(PeekI);
31377
31378 // If I is constant, it will fold and we can evaluate later. If it's an
31379 // argument or something of that nature, we can't analyze.
31380 if (I == nullptr)
31381 return {nullptr, UndefBit};
31382 }
31383 // We can only use 1 << X without more sophisticated analysis. C << X where
31384 // C is a power of 2 but not 1 can result in zero which cannot be translated
31385 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31386 if (I->getOpcode() == Instruction::Shl) {
31387 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31388 // -X` and some other provable power of 2 patterns that we can use CTZ on
31389 // may be profitable.
31390 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31391 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31392 // be provably a non-zero power of 2.
31393 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31394 // transformable to bittest.
31395 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31396 if (!ShiftVal)
31397 return {nullptr, UndefBit};
31398 if (ShiftVal->equalsInt(1))
31399 BTK = Not ? NotShiftBit : ShiftBit;
31400
31401 if (BTK == UndefBit)
31402 return {nullptr, UndefBit};
31403
31404 Value *BitV = I->getOperand(1);
31405
31406 // Read past a shiftmask instruction to find count
31407 Value *AndOp;
31408 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31409 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31410 BitV = AndOp;
31411
31412 return {BitV, BTK};
31413 }
31414 }
31415 return {nullptr, UndefBit};
31416}
31417
31418TargetLoweringBase::AtomicExpansionKind
31419X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31420 using namespace llvm::PatternMatch;
31421 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31422 // prefix to a normal instruction for these operations.
31423 if (AI->use_empty())
31424 return AtomicExpansionKind::None;
31425
31426 if (AI->getOperation() == AtomicRMWInst::Xor) {
31427 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31428 // preferable to both `cmpxchg` and `btc`.
31429 if (match(AI->getOperand(1), m_SignMask()))
31430 return AtomicExpansionKind::Expand;
31431 }
31432
31433 // If the atomicrmw's result is used by a single bit AND, we may use
31434 // bts/btr/btc instruction for these operations.
31435 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31436 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31437 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31438 // detect it.
31439 Instruction *I = AI->user_back();
31440 auto BitChange = FindSingleBitChange(AI->getValOperand());
31441 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31442 I->getOpcode() != Instruction::And ||
31443 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31444 AI->getParent() != I->getParent())
31445 return AtomicExpansionKind::CmpXChg;
31446
31447 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31448
31449 // This is a redundant AND, it should get cleaned up elsewhere.
31450 if (AI == I->getOperand(OtherIdx))
31451 return AtomicExpansionKind::CmpXChg;
31452
31453 // The following instruction must be an AND with a single bit.
31454 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31455 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31456 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31457 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31458 return AtomicExpansionKind::CmpXChg;
31459 }
31460 if (AI->getOperation() == AtomicRMWInst::And) {
31461 return ~C1->getValue() == C2->getValue()
31462 ? AtomicExpansionKind::BitTestIntrinsic
31463 : AtomicExpansionKind::CmpXChg;
31464 }
31465 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31466 : AtomicExpansionKind::CmpXChg;
31467 }
31468
31469 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31470
31471 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31472 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31473 return AtomicExpansionKind::CmpXChg;
31474
31475 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31476
31477 // If shift amounts are not the same we can't use BitTestIntrinsic.
31478 if (BitChange.first != BitTested.first)
31479 return AtomicExpansionKind::CmpXChg;
31480
31481 // For an atomic AND, the mask must clear exactly one bit and the test must
31482 // check that same bit (the one left unset in the mask).
31483 if (AI->getOperation() == AtomicRMWInst::And)
31484 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31485 ? AtomicExpansionKind::BitTestIntrinsic
31486 : AtomicExpansionKind::CmpXChg;
31487
31488 // For an atomic XOR/OR, we must be setting and testing the same bit.
31489 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31490 ? AtomicExpansionKind::BitTestIntrinsic
31491 : AtomicExpansionKind::CmpXChg;
31492}
31493
31494void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31495 IRBuilder<> Builder(AI);
31496 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31497 Intrinsic::ID IID_C;
31498 Intrinsic::ID IID_I;
31499 switch (AI->getOperation()) {
31500 default:
31501 llvm_unreachable("Unknown atomic operation");
31502 case AtomicRMWInst::Or:
31503 IID_C = Intrinsic::x86_atomic_bts;
31504 IID_I = Intrinsic::x86_atomic_bts_rm;
31505 break;
31506 case AtomicRMWInst::Xor:
31507 IID_C = Intrinsic::x86_atomic_btc;
31508 IID_I = Intrinsic::x86_atomic_btc_rm;
31509 break;
31510 case AtomicRMWInst::And:
31511 IID_C = Intrinsic::x86_atomic_btr;
31512 IID_I = Intrinsic::x86_atomic_btr_rm;
31513 break;
31514 }
31515 Instruction *I = AI->user_back();
31516 LLVMContext &Ctx = AI->getContext();
31517 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31518 PointerType::getUnqual(Ctx));
31519 Value *Result = nullptr;
31520 auto BitTested = FindSingleBitChange(AI->getValOperand());
31521 assert(BitTested.first != nullptr);
31522
31523 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31524 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31525
31526 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31527 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31528 {Addr, Builder.getInt8(Imm)});
31529 } else {
31530 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31531
31532 Value *SI = BitTested.first;
31533 assert(SI != nullptr);
31534
31535 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
31536 // need to mask it.
31537 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31538 Value *BitPos =
31539 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31540 // Todo(1): In many cases it may be provable that SI is less than
31541 // ShiftBits in which case this mask is unnecessary
31542 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31543 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31544 // favor of just a raw BT{S|R|C}.
31545
31546 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31547 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31548
31549 // If the result is only used for zero/non-zero status then we don't need to
31550 // shift the value back. Otherwise do so.
31551 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31552 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31553 if (ICmp->isEquality()) {
31554 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31555 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31556 if (C0 || C1) {
31557 assert(C0 == nullptr || C1 == nullptr);
31558 if ((C0 ? C0 : C1)->isZero())
31559 continue;
31560 }
31561 }
31562 }
31563 Result = Builder.CreateShl(Result, BitPos);
31564 break;
31565 }
31566 }
31567
31568 I->replaceAllUsesWith(Result);
31569 I->eraseFromParent();
31570 AI->eraseFromParent();
31571}
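// ----------------------------------------------------------------------------
// Illustrative sketch (not part of this file): the kind of source-level
// pattern the bit-test path is aimed at. Whether a particular build emits
// `lock bts` for it depends on the checks in shouldExpandLogicAtomicRMWInIR
// above; the helper below is invented for illustration.
#include <atomic>
#include <cstdint>

static bool atomicTestAndSetBit(std::atomic<uint32_t> &Flags, unsigned N) {
  uint32_t Bit = 1u << (N & 31); // single bit, shift amount pre-masked
  // fetch_or of one bit whose result is only tested against that same bit:
  // the candidate for `lock bts` plus a flag check instead of a cmpxchg loop.
  return (Flags.fetch_or(Bit) & Bit) != 0;
}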
31572
31573static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31574 using namespace llvm::PatternMatch;
31575 if (!AI->hasOneUse())
31576 return false;
31577
31578 Value *Op = AI->getOperand(1);
31579 CmpPredicate Pred;
31580 Instruction *I = AI->user_back();
31581 AtomicRMWInst::BinOp Opc = AI->getOperation();
31582 if (Opc == AtomicRMWInst::Add) {
31583 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31584 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31585 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31586 if (match(I->user_back(),
31587 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
31588 return true;
31589 if (match(I->user_back(),
31590 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
31591 return true;
31592 }
31593 return false;
31594 }
31595 if (Opc == AtomicRMWInst::Sub) {
31596 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31597 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31598 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
31599 if (match(I->user_back(),
31600 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
31601 return true;
31602 if (match(I->user_back(),
31603 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
31604 return true;
31605 }
31606 return false;
31607 }
31608 if ((Opc == AtomicRMWInst::Or &&
31609 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
31610 (Opc == AtomicRMWInst::And &&
31611 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
31612 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31613 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
31614 Pred == CmpInst::ICMP_SLT;
31615 if (match(I->user_back(),
31616 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
31617 return true;
31618 return false;
31619 }
31620 if (Opc == AtomicRMWInst::Xor) {
31621 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31622 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31623 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
31624 if (match(I->user_back(),
31625 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
31626 return true;
31627 if (match(I->user_back(),
31628 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
31629 return true;
31630 }
31631 return false;
31632 }
31633
31634 return false;
31635}
31636
31637void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
31638 AtomicRMWInst *AI) const {
31639 IRBuilder<> Builder(AI);
31640 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31641 Instruction *TempI = nullptr;
31642 LLVMContext &Ctx = AI->getContext();
31643 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
31644 if (!ICI) {
31645 TempI = AI->user_back();
31646 assert(TempI->hasOneUse() && "Must have one use");
31647 ICI = cast<ICmpInst>(TempI->user_back());
31648 }
31649 X86::CondCode CC = X86::COND_INVALID;
31650 ICmpInst::Predicate Pred = ICI->getPredicate();
31651 switch (Pred) {
31652 default:
31653 llvm_unreachable("Not supported Pred");
31654 case CmpInst::ICMP_EQ:
31655 CC = X86::COND_E;
31656 break;
31657 case CmpInst::ICMP_NE:
31658 CC = X86::COND_NE;
31659 break;
31660 case CmpInst::ICMP_SLT:
31661 CC = X86::COND_S;
31662 break;
31663 case CmpInst::ICMP_SGT:
31664 CC = X86::COND_NS;
31665 break;
31666 }
31667 Intrinsic::ID IID = Intrinsic::not_intrinsic;
31668 switch (AI->getOperation()) {
31669 default:
31670 llvm_unreachable("Unknown atomic operation");
31671 case AtomicRMWInst::Add:
31672 IID = Intrinsic::x86_atomic_add_cc;
31673 break;
31674 case AtomicRMWInst::Sub:
31675 IID = Intrinsic::x86_atomic_sub_cc;
31676 break;
31677 case AtomicRMWInst::Or:
31678 IID = Intrinsic::x86_atomic_or_cc;
31679 break;
31680 case AtomicRMWInst::And:
31681 IID = Intrinsic::x86_atomic_and_cc;
31682 break;
31683 case AtomicRMWInst::Xor:
31684 IID = Intrinsic::x86_atomic_xor_cc;
31685 break;
31686 }
31687 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31688 PointerType::getUnqual(Ctx));
31689 Value *Call = Builder.CreateIntrinsic(
31690 IID, AI->getType(),
31691 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
31692 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
31693 ICI->replaceAllUsesWith(Result);
31694 ICI->eraseFromParent();
31695 if (TempI)
31696 TempI->eraseFromParent();
31697 AI->eraseFromParent();
31698}
31699
31700TargetLoweringBase::AtomicExpansionKind
31701X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
31702 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31703 Type *MemType = AI->getType();
31704
31705 // If the operand is too big, we must see if cmpxchg8/16b is available
31706 // and default to library calls otherwise.
31707 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
31708 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31709 : AtomicExpansionKind::None;
31710 }
31711
31713 switch (Op) {
31714 case AtomicRMWInst::Xchg:
31715 return AtomicExpansionKind::None;
31716 case AtomicRMWInst::Add:
31717 case AtomicRMWInst::Sub:
31718 if (shouldExpandCmpArithRMWInIR(AI))
31719 return AtomicExpansionKind::CmpArithIntrinsic;
31720 // It's better to use xadd, xsub or xchg for these in other cases.
31721 return AtomicExpansionKind::None;
31722 case AtomicRMWInst::Or:
31723 case AtomicRMWInst::And:
31724 case AtomicRMWInst::Xor:
31725 if (shouldExpandCmpArithRMWInIR(AI))
31726 return AtomicExpansionKind::CmpArithIntrinsic;
31727 return shouldExpandLogicAtomicRMWInIR(AI);
31728 case AtomicRMWInst::Nand:
31729 case AtomicRMWInst::Max:
31730 case AtomicRMWInst::Min:
31741 default:
31742 // These always require a non-trivial set of data operations on x86. We must
31743 // use a cmpxchg loop.
31744 return AtomicExpansionKind::CmpXChg;
31745 }
31746}
31747
31748LoadInst *
31749X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
31750 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31751 Type *MemType = AI->getType();
31752 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
31753 // there is no benefit in turning such RMWs into loads, and it is actually
31754 // harmful as it introduces an mfence.
31755 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
31756 return nullptr;
31757
31758 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
31759 // lowering available in lowerAtomicArith.
31760 // TODO: push more cases through this path.
31761 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
31762 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
31763 AI->use_empty())
31764 return nullptr;
31765
31766 IRBuilder<> Builder(AI);
31767 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31768 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
31769 auto SSID = AI->getSyncScopeID();
31770 // We must restrict the ordering to avoid generating loads with Release or
31771 // ReleaseAcquire orderings.
31772 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
31773
31774 // Before the load we need a fence. Here is an example lifted from
31775 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
31776 // is required:
31777 // Thread 0:
31778 // x.store(1, relaxed);
31779 // r1 = y.fetch_add(0, release);
31780 // Thread 1:
31781 // y.fetch_add(42, acquire);
31782 // r2 = x.load(relaxed);
31783 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
31784 // lowered to just a load without a fence. A mfence flushes the store buffer,
31785 // making the optimization clearly correct.
31786 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
31787 // clear whether it is needed otherwise; we might be able to be more
31788 // aggressive on relaxed idempotent rmw. In practice, they do not look
31789 // useful, so we don't try to be especially clever.
31790 if (SSID == SyncScope::SingleThread)
31791 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
31792 // the IR level, so we must wrap it in an intrinsic.
31793 return nullptr;
31794
31795 if (!Subtarget.hasMFence())
31796 // FIXME: it might make sense to use a locked operation here but on a
31797 // different cache-line to prevent cache-line bouncing. In practice it
31798 // is probably a small win, and x86 processors without mfence are rare
31799 // enough that we do not bother.
31800 return nullptr;
31801
31802 Function *MFence =
31803 llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence);
31804 Builder.CreateCall(MFence, {});
31805
31806 // Finally we can emit the atomic load.
31807 LoadInst *Loaded = Builder.CreateAlignedLoad(
31808 AI->getType(), AI->getPointerOperand(), AI->getAlign());
31809 Loaded->setAtomic(Order, SSID);
31810 AI->replaceAllUsesWith(Loaded);
31811 AI->eraseFromParent();
31812 return Loaded;
31813}
31814
31815/// Emit a locked operation on a stack location which does not change any
31816/// memory location, but does involve a lock prefix. Location is chosen to be
31817/// a) very likely accessed only by a single thread to minimize cache traffic,
31818/// and b) definitely dereferenceable. Returns the new Chain result.
31819static SDValue emitLockedStackOp(SelectionDAG &DAG,
31820 const X86Subtarget &Subtarget, SDValue Chain,
31821 const SDLoc &DL) {
31822 // Implementation notes:
31823 // 1) LOCK prefix creates a full read/write reordering barrier for memory
31824 // operations issued by the current processor. As such, the location
31825 // referenced is not relevant for the ordering properties of the instruction.
31826 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
31827 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
31828 // 2) Using an immediate operand appears to be the best encoding choice
31829 // here since it doesn't require an extra register.
31830 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
31831 // is small enough it might just be measurement noise.)
31832 // 4) When choosing offsets, there are several contributing factors:
31833 // a) If there's no redzone, we default to TOS. (We could allocate a cache
31834 // line aligned stack object to improve this case.)
31835 // b) To minimize our chances of introducing a false dependence, we prefer
31836 // to offset the stack usage from TOS slightly.
31837 // c) To minimize concerns about cross thread stack usage - in particular,
31838 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
31839 // captures state in the TOS frame and accesses it from many threads -
31840 // we want to use an offset such that the offset is in a distinct cache
31841 // line from the TOS frame.
31842 //
31843 // For a general discussion of the tradeoffs and benchmark results, see:
31844 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
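  // Illustrative sketch (not from the original source): with a 128-byte red
  // zone the node built below corresponds roughly to
  //   lock orl $0, -64(%rsp)   // 64-bit, red zone available
  //   lock orl $0, (%esp)      // 32-bit, no red zone
  // i.e. a LOCK-prefixed OR of an immediate zero that changes no memory but
  // still provides the full fence semantics described above.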
31845
31846 auto &MF = DAG.getMachineFunction();
31847 auto &TFL = *Subtarget.getFrameLowering();
31848 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31849
31850 if (Subtarget.is64Bit()) {
31851 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31852 SDValue Ops[] = {
31853 DAG.getRegister(X86::RSP, MVT::i64), // Base
31854 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31855 DAG.getRegister(0, MVT::i64), // Index
31856 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31857 DAG.getRegister(0, MVT::i16), // Segment.
31858 Zero,
31859 Chain};
31860 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31861 MVT::Other, Ops);
31862 return SDValue(Res, 1);
31863 }
31864
31865 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31866 SDValue Ops[] = {
31867 DAG.getRegister(X86::ESP, MVT::i32), // Base
31868 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31869 DAG.getRegister(0, MVT::i32), // Index
31870 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
31871 DAG.getRegister(0, MVT::i16), // Segment.
31872 Zero,
31873 Chain
31874 };
31875 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
31876 MVT::Other, Ops);
31877 return SDValue(Res, 1);
31878}
31879
31880static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
31881 SelectionDAG &DAG) {
31882 SDLoc dl(Op);
31883 AtomicOrdering FenceOrdering =
31884 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
31885 SyncScope::ID FenceSSID =
31886 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
31887
31888 // The only fence that needs an instruction is a sequentially-consistent
31889 // cross-thread fence.
31890 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
31891 FenceSSID == SyncScope::System) {
31892 if (Subtarget.hasMFence())
31893 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
31894
31895 SDValue Chain = Op.getOperand(0);
31896 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
31897 }
31898
31899 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31900 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
31901}
31902
31903static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
31904 SelectionDAG &DAG) {
31905 MVT T = Op.getSimpleValueType();
31906 SDLoc DL(Op);
31907 unsigned Reg = 0;
31908 unsigned size = 0;
31909 switch(T.SimpleTy) {
31910 default: llvm_unreachable("Invalid value type!");
31911 case MVT::i8: Reg = X86::AL; size = 1; break;
31912 case MVT::i16: Reg = X86::AX; size = 2; break;
31913 case MVT::i32: Reg = X86::EAX; size = 4; break;
31914 case MVT::i64:
31915 assert(Subtarget.is64Bit() && "Node not type legal!");
31916 Reg = X86::RAX; size = 8;
31917 break;
31918 }
31919 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
31920 Op.getOperand(2), SDValue());
31921 SDValue Ops[] = { cpIn.getValue(0),
31922 Op.getOperand(1),
31923 Op.getOperand(3),
31924 DAG.getTargetConstant(size, DL, MVT::i8),
31925 cpIn.getValue(1) };
31926 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31927 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
31928 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
31929 Ops, T, MMO);
31930
31931 SDValue cpOut =
31932 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
31933 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
31934 MVT::i32, cpOut.getValue(2));
31935 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
31936
31937 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
31938 cpOut, Success, EFLAGS.getValue(1));
31939}
31940
31941// Create MOVMSKB, taking into account whether we need to split for AVX1.
31942static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
31943 const X86Subtarget &Subtarget) {
31944 MVT InVT = V.getSimpleValueType();
31945
31946 if (InVT == MVT::v64i8) {
31947 SDValue Lo, Hi;
31948 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31949 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
31950 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
31951 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
31952 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
31953 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
31954 DAG.getConstant(32, DL, MVT::i8));
31955 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
31956 }
31957 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
31958 SDValue Lo, Hi;
31959 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31960 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
31961 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
31962 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
31963 DAG.getConstant(16, DL, MVT::i8));
31964 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
31965 }
31966
31967 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31968}
31969
31970static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
31971 SelectionDAG &DAG) {
31972 SDValue Src = Op.getOperand(0);
31973 MVT SrcVT = Src.getSimpleValueType();
31974 MVT DstVT = Op.getSimpleValueType();
31975
31976 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
31977 // half to v32i1 and concatenating the result.
31978 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
31979 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31980 assert(Subtarget.hasBWI() && "Expected BWI target");
31981 SDLoc dl(Op);
31982 SDValue Lo, Hi;
31983 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
31984 Lo = DAG.getBitcast(MVT::v32i1, Lo);
31985 Hi = DAG.getBitcast(MVT::v32i1, Hi);
31986 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
31987 }
31988
31989 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
31990 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
31991 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
31992 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
31993 SDLoc DL(Op);
31994 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
31995 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31996 return DAG.getZExtOrTrunc(V, DL, DstVT);
31997 }
31998
31999 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32000 SrcVT == MVT::i64) && "Unexpected VT!");
32001
32002 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32003 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32004 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32005 // This conversion needs to be expanded.
32006 return SDValue();
32007
32008 SDLoc dl(Op);
32009 if (SrcVT.isVector()) {
32010 // Widen the input vector in the case of MVT::v2i32.
32011 // Example: from MVT::v2i32 to MVT::v4i32.
32012 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32013 SrcVT.getVectorNumElements() * 2);
32014 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32015 DAG.getUNDEF(SrcVT));
32016 } else {
32017 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32018 "Unexpected source type in LowerBITCAST");
32019 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32020 }
32021
32022 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32023 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32024
32025 if (DstVT == MVT::x86mmx)
32026 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32027
32028 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32029 DAG.getVectorIdxConstant(0, dl));
32030}
32031
32032/// Compute the horizontal sum of bytes in V for the elements of VT.
32033///
32034/// Requires V to be a byte vector and VT to be an integer vector type with
32035/// wider elements than V's type. The width of the elements of VT determines
32036/// how many bytes of V are summed horizontally to produce each element of the
32037/// result.
32038static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32039 const X86Subtarget &Subtarget,
32040 SelectionDAG &DAG) {
32041 SDLoc DL(V);
32042 MVT ByteVecVT = V.getSimpleValueType();
32043 MVT EltVT = VT.getVectorElementType();
32044 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32045 "Expected value to have byte element type.");
32046 assert(EltVT != MVT::i8 &&
32047 "Horizontal byte sum only makes sense for wider elements!");
32048 unsigned VecSize = VT.getSizeInBits();
32049 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32050
32051 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32052 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32053 if (EltVT == MVT::i64) {
32054 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32055 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32056 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32057 return DAG.getBitcast(VT, V);
32058 }
32059
32060 if (EltVT == MVT::i32) {
32061 // We unpack the low half and high half into i32s interleaved with zeros so
32062 // that we can use PSADBW to horizontally sum them. The most useful part of
32063 // this is that it lines up the results of two PSADBW instructions to be
32064 // two v2i64 vectors which concatenated are the 4 population counts. We can
32065 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
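  // Illustration (not part of the original source): for a v4i32 result the 16
  // byte counts are unpacked into two zero-interleaved halves, each PSADBW
  // reduces its half to two i64 sums (one per original i32 lane), and the
  // PACKUS below narrows those four sums back into the four i32 lanes.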
32066 SDValue Zeros = DAG.getConstant(0, DL, VT);
32067 SDValue V32 = DAG.getBitcast(VT, V);
32068 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32069 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32070
32071 // Do the horizontal sums into two v2i64s.
32072 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32073 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32074 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32075 DAG.getBitcast(ByteVecVT, Low), Zeros);
32076 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32077 DAG.getBitcast(ByteVecVT, High), Zeros);
32078
32079 // Merge them together.
32080 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32081 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32082 DAG.getBitcast(ShortVecVT, Low),
32083 DAG.getBitcast(ShortVecVT, High));
32084
32085 return DAG.getBitcast(VT, V);
32086 }
32087
32088 // The only element type left is i16.
32089 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32090
32091 // To obtain pop count for each i16 element starting from the pop count for
32092 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32093 // right by 8. It is important to shift as i16s since an i8 vector shift isn't
32094 // directly supported.
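  // Worked example (illustration only): if one i16 element holds the byte
  // counts [hi=2, lo=3], then (x << 8) + x leaves hi+lo = 5 in the upper byte
  // (giving 0x0503), and the final i16 shift right by 8 produces 0x0005.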
32095 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32096 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32097 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32098 DAG.getBitcast(ByteVecVT, V));
32099 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32100}
32101
32102static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32103 const X86Subtarget &Subtarget,
32104 SelectionDAG &DAG) {
32105 MVT VT = Op.getSimpleValueType();
32106 MVT EltVT = VT.getVectorElementType();
32107 int NumElts = VT.getVectorNumElements();
32108 (void)EltVT;
32109 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32110
32111 // Implement a lookup table in register by using an algorithm based on:
32112 // http://wm.ite.pl/articles/sse-popcount.html
32113 //
32114 // The general idea is that every lower byte nibble in the input vector is an
32115 // index into an in-register pre-computed pop count table. We then split up the
32116 // input vector into two new ones: (1) a vector with only the shifted-right
32117 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32118 // masked out higher ones) for each byte. PSHUFB is used separately with both
32119 // to index the in-register table. Next, both are added and the result is an
32120 // i8 vector where each element contains the pop count for its input byte.
32121 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32122 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32123 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32124 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
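  // Worked example (illustration only): for the input byte 0xE5 = 0b11100101,
  // the high nibble 0xE indexes LUT value 3 and the low nibble 0x5 indexes LUT
  // value 2; their sum 5 equals popcount(0xE5).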
32125
32126 SmallVector<SDValue, 64> LUTVec;
32127 for (int i = 0; i < NumElts; ++i)
32128 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32129 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32130 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32131
32132 // High nibbles
32133 SDValue FourV = DAG.getConstant(4, DL, VT);
32134 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32135
32136 // Low nibbles
32137 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32138
32139 // The input vector is used as the shuffle mask that indexes elements into the
32140 // LUT. After counting low and high nibbles, add the vector to obtain the
32141 // final pop count per i8 element.
32142 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32143 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32144 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32145}
32146
32147// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32148// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32149static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32150 const X86Subtarget &Subtarget,
32151 SelectionDAG &DAG) {
32152 MVT VT = Op.getSimpleValueType();
32153 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32154 "Unknown CTPOP type to handle");
32155 SDValue Op0 = Op.getOperand(0);
32156
32157 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32158 if (Subtarget.hasVPOPCNTDQ()) {
32159 unsigned NumElems = VT.getVectorNumElements();
32160 assert((VT.getVectorElementType() == MVT::i8 ||
32161 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32162 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32163 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32164 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32165 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32166 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32167 }
32168 }
32169
32170 // Decompose 256-bit ops into smaller 128-bit ops.
32171 if (VT.is256BitVector() && !Subtarget.hasInt256())
32172 return splitVectorIntUnary(Op, DAG, DL);
32173
32174 // Decompose 512-bit ops into smaller 256-bit ops.
32175 if (VT.is512BitVector() && !Subtarget.hasBWI())
32176 return splitVectorIntUnary(Op, DAG, DL);
32177
32178 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32179 if (VT.getScalarType() != MVT::i8) {
32180 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32181 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32182 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32183 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32184 }
32185
32186 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32187 if (!Subtarget.hasSSSE3())
32188 return SDValue();
32189
32190 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32191}
32192
32193static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32194 SelectionDAG &DAG) {
32195 MVT VT = N.getSimpleValueType();
32196 SDValue Op = N.getOperand(0);
32197 SDLoc DL(N);
32198
32199 if (VT.isScalarInteger()) {
32200 // Compute the lower/upper bounds of the active bits of the value,
32201 // allowing us to shift the active bits down if necessary to fit into the
32202 // special cases below.
32203 KnownBits Known = DAG.computeKnownBits(Op);
32204 if (Known.isConstant())
32205 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32206 unsigned LZ = Known.countMinLeadingZeros();
32207 unsigned TZ = Known.countMinTrailingZeros();
32208 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32209 unsigned ActiveBits = Known.getBitWidth() - LZ;
32210 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32211
32212 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32213 if (ShiftedActiveBits <= 2) {
32214 if (ActiveBits > 2)
32215 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32216 DAG.getShiftAmountConstant(TZ, VT, DL));
32217 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32218 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32219 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32220 DAG.getShiftAmountConstant(1, VT, DL)));
32221 return DAG.getZExtOrTrunc(Op, DL, VT);
32222 }
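    // Quick check of the identity above (illustration only): for the 2-bit
    // values x = 0..3, x - (x >> 1) yields 0, 1, 1, 2, which matches ctpop(x).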
32223
32224 // i3 CTPOP - perform LUT into i32 integer.
32225 if (ShiftedActiveBits <= 3) {
32226 if (ActiveBits > 3)
32227 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32228 DAG.getShiftAmountConstant(TZ, VT, DL));
32229 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32230 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32231 DAG.getShiftAmountConstant(1, VT, DL));
32232 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32233 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32234 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32235 DAG.getConstant(0x3, DL, MVT::i32));
32236 return DAG.getZExtOrTrunc(Op, DL, VT);
32237 }
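    // Worked example (illustration only): 0b1110100110010100 packs popcount(v)
    // for v = 7..0 as 2-bit fields, least-significant field first. For x = 5
    // the shift amount is 5 << 1 = 10 and
    // (0b1110100110010100 >> 10) & 0x3 == 2, which is popcount(5).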
32238
32239 // i4 CTPOP - perform LUT into i64 integer.
32240 if (ShiftedActiveBits <= 4 &&
32241 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32242 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32243 if (ActiveBits > 4)
32244 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32245 DAG.getShiftAmountConstant(TZ, VT, DL));
32246 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32247 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32248 DAG.getConstant(4, DL, MVT::i32));
32249 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32250 DAG.getShiftAmountOperand(MVT::i64, Op));
32251 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32252 DAG.getConstant(0x7, DL, MVT::i64));
32253 return DAG.getZExtOrTrunc(Op, DL, VT);
32254 }
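    // Worked example (illustration only): nibble i of 0x4332322132212110 holds
    // popcount(i). For x = 0xB the shift amount is 11 * 4 = 44 and
    // (0x4332322132212110 >> 44) & 0x7 == 3, which is popcount(0xB).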
32255
32256 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
32257 if (ShiftedActiveBits <= 8) {
32258 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32259 if (ActiveBits > 8)
32260 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32261 DAG.getShiftAmountConstant(TZ, VT, DL));
32262 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32263 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32264 DAG.getConstant(0x08040201U, DL, MVT::i32));
32265 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32266 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32267 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32268 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32269 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32270 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32271 return DAG.getZExtOrTrunc(Op, DL, VT);
32272 }
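    // Sketch of the trick above (illustration only): x * 0x08040201 places
    // copies of the 8-bit value x at bit offsets 0, 9, 18 and 27; after >> 3
    // and the mask with 0x11111111, each bit of x sits alone in its own
    // nibble, the multiply by 0x11111111 sums all nibbles into bits 31:28, and
    // the final >> 28 extracts popcount(x), which is at most 8 and fits in one
    // nibble.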
32273
32274 return SDValue(); // fallback to generic expansion.
32275 }
32276
32277 assert(VT.isVector() &&
32278 "We only do custom lowering for vector population count.");
32279 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32280}
32281
32282static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32283 MVT VT = Op.getSimpleValueType();
32284 SDValue In = Op.getOperand(0);
32285 SDLoc DL(Op);
32286
32287 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32288 // perform the BITREVERSE.
32289 if (!VT.isVector()) {
32290 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32291 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32292 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32293 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32294 DAG.getVectorIdxConstant(0, DL));
32295 }
32296
32297 int NumElts = VT.getVectorNumElements();
32298 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32299
32300 // Decompose 256-bit ops into smaller 128-bit ops.
32301 if (VT.is256BitVector())
32302 return splitVectorIntUnary(Op, DAG, DL);
32303
32304 assert(VT.is128BitVector() &&
32305 "Only 128-bit vector bitreverse lowering supported.");
32306
32307 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32308 // perform the BSWAP in the shuffle.
32309 // It's best to shuffle using the second operand as this will implicitly allow
32310 // memory folding for multiple vectors.
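  // Illustration (not from the original source): each control byte below is
  // (2 << 5) | SourceByte, where the high bits select VPPERM's "reverse bits"
  // operation and SourceByte >= 16 picks from the second source. For element 0
  // of a v4i32 input the mask bytes are 0x53, 0x52, 0x51, 0x50, i.e. the bytes
  // of that element taken in reverse order with their bits reversed.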
32311 SmallVector<SDValue, 16> MaskElts;
32312 for (int i = 0; i != NumElts; ++i) {
32313 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32314 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32315 int PermuteByte = SourceByte | (2 << 5);
32316 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32317 }
32318 }
32319
32320 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32321 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32322 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32323 Res, Mask);
32324 return DAG.getBitcast(VT, Res);
32325}
32326
32327static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32328 SelectionDAG &DAG) {
32329 MVT VT = Op.getSimpleValueType();
32330
32331 if (Subtarget.hasXOP() && !VT.is512BitVector())
32332 return LowerBITREVERSE_XOP(Op, DAG);
32333
32334 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
32335
32336 SDValue In = Op.getOperand(0);
32337 SDLoc DL(Op);
32338
32339 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32340 if (VT.is512BitVector() && !Subtarget.hasBWI())
32341 return splitVectorIntUnary(Op, DAG, DL);
32342
32343 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32344 if (VT.is256BitVector() && !Subtarget.hasInt256())
32345 return splitVectorIntUnary(Op, DAG, DL);
32346
32347 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32348 if (!VT.isVector()) {
32349 assert(
32350 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32351 "Only tested for i8/i16/i32/i64");
32352 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32353 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32354 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32355 DAG.getBitcast(MVT::v16i8, Res));
32356 Res =
32357 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32358 DAG.getVectorIdxConstant(0, DL));
32359 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32360 }
32361
32362 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32363
32364 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32365 if (VT.getScalarType() != MVT::i8) {
32366 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32367 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32368 Res = DAG.getBitcast(ByteVT, Res);
32369 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32370 return DAG.getBitcast(VT, Res);
32371 }
32372 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32373 "Only byte vector BITREVERSE supported");
32374
32375 unsigned NumElts = VT.getVectorNumElements();
32376
32377 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32378 if (Subtarget.hasGFNI()) {
32379 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32380 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32381 DAG.getTargetConstant(0, DL, MVT::i8));
32382 }
32383
32384 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32385 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32386 // 0-15 value (moved to the other nibble).
32387 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32388 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32389 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32390
32391 const int LoLUT[16] = {
32392 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32393 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32394 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32395 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32396 const int HiLUT[16] = {
32397 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32398 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32399 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32400 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
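  // Worked example (illustration only): for the input byte 0x1E, LoLUT[0xE] is
  // 0x70 and HiLUT[0x1] is 0x08; OR-ing the two PSHUFB results gives 0x78,
  // which is 0x1E with its bits reversed.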
32401
32402 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32403 for (unsigned i = 0; i < NumElts; ++i) {
32404 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32405 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32406 }
32407
32408 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32409 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32410 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32411 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32412 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32413}
32414
32415static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32416 SelectionDAG &DAG) {
32417 SDLoc DL(Op);
32418 SDValue X = Op.getOperand(0);
32419 MVT VT = Op.getSimpleValueType();
32420
32421 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32422 if (VT == MVT::i8 ||
32424 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32425 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32426 DAG.getConstant(0, DL, MVT::i8));
32427 // Copy the inverse of the parity flag into a register with setcc.
32428 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32429 // Extend to the original type.
32430 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32431 }
32432
32433 // If we have POPCNT, use the default expansion.
32434 if (Subtarget.hasPOPCNT())
32435 return SDValue();
32436
32437 if (VT == MVT::i64) {
32438 // Xor the high and low 32-bit halves together using a 32-bit operation.
32439 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32440 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32441 DAG.getConstant(32, DL, MVT::i8)));
32442 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32443 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32444 }
32445
32446 if (VT != MVT::i16) {
32447 // Xor the high and low 16-bits together using a 32-bit operation.
32448 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32449 DAG.getConstant(16, DL, MVT::i8));
32450 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32451 } else {
32452 // If the input is 16-bits, we need to extend to use an i32 shift below.
32453 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32454 }
32455
32456 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32457 // This should allow an h-reg to be used to save a shift.
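  // Worked example (illustration only): for X = 0x12345678 the folds give
  // 0x1234 ^ 0x5678 = 0x444C and then 0x44 ^ 0x4C = 0x08, which has an odd
  // number of set bits, so PF is clear and SETNP returns 1; the original value
  // has 13 set bits, so its parity is indeed 1.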
32458 SDValue Hi = DAG.getNode(
32459 ISD::TRUNCATE, DL, MVT::i8,
32460 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32461 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32462 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32463 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32464
32465 // Copy the inverse of the parity flag into a register with setcc.
32466 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32467 // Extend to the original type.
32468 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32469}
32470
32471static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32472 const X86Subtarget &Subtarget) {
32473 unsigned NewOpc = 0;
32474 switch (N->getOpcode()) {
32475 case ISD::ATOMIC_LOAD_ADD:
32476 NewOpc = X86ISD::LADD;
32477 break;
32478 case ISD::ATOMIC_LOAD_SUB:
32479 NewOpc = X86ISD::LSUB;
32480 break;
32481 case ISD::ATOMIC_LOAD_OR:
32482 NewOpc = X86ISD::LOR;
32483 break;
32484 case ISD::ATOMIC_LOAD_XOR:
32485 NewOpc = X86ISD::LXOR;
32486 break;
32487 case ISD::ATOMIC_LOAD_AND:
32488 NewOpc = X86ISD::LAND;
32489 break;
32490 default:
32491 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32492 }
32493
32494 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32495
32496 return DAG.getMemIntrinsicNode(
32497 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32498 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32499 /*MemVT=*/N->getSimpleValueType(0), MMO);
32500}
32501
32502/// Lower atomic_load_ops into LOCK-prefixed operations.
32503static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32504 const X86Subtarget &Subtarget) {
32505 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32506 SDValue Chain = N->getOperand(0);
32507 SDValue LHS = N->getOperand(1);
32508 SDValue RHS = N->getOperand(2);
32509 unsigned Opc = N->getOpcode();
32510 MVT VT = N->getSimpleValueType(0);
32511 SDLoc DL(N);
32512
32513 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32514 // can only be lowered when the result is unused. They should have already
32515 // been transformed into a cmpxchg loop in AtomicExpand.
32516 if (N->hasAnyUseOfValue(0)) {
32517 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32518 // select LXADD if LOCK_SUB can't be selected.
32519 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32520 // can use LXADD as opposed to cmpxchg.
32521 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32522 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32523 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32524 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32525
32527 "Used AtomicRMW ops other than Add should have been expanded!");
32528 return N;
32529 }
32530
32531 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32532 // The core idea here is that since the memory location isn't actually
32533 // changing, all we need is a lowering for the *ordering* impacts of the
32534 // atomicrmw. As such, we can choose a different operation and memory
32535 // location to minimize impact on other code.
32536 // The above holds unless the node is marked volatile, in which
32537 // case it needs to be preserved according to the langref.
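  // Example (illustration only): IR such as
  //   %old = atomicrmw or ptr %p, i32 0 seq_cst
  // with %old unused reaches this path as (atomic_load_or p, 0); only the
  // ordering needs code, so a locked stack op or a MEMBARRIER is emitted
  // instead of an actual read-modify-write of %p.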
32538 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32539 // On X86, the only ordering which actually requires an instruction is
32540 // seq_cst which isn't SingleThread; everything else just needs to be preserved
32541 // during codegen and then dropped. Note that we expect (but don't assume)
32542 // that orderings other than seq_cst and acq_rel have been canonicalized to
32543 // a store or load.
32544 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32545 AN->getSyncScopeID() == SyncScope::System) {
32546 // Prefer a locked operation against a stack location to minimize cache
32547 // traffic. This assumes that stack locations are very likely to be
32548 // accessed only by the owning thread.
32549 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32550 assert(!N->hasAnyUseOfValue(0));
32551 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32552 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32553 DAG.getUNDEF(VT), NewChain);
32554 }
32555 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32556 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32557 assert(!N->hasAnyUseOfValue(0));
32558 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32559 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32560 DAG.getUNDEF(VT), NewChain);
32561 }
32562
32563 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32564 // RAUW the chain, but don't worry about the result, as it's unused.
32565 assert(!N->hasAnyUseOfValue(0));
32566 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32567 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32568 DAG.getUNDEF(VT), LockOp.getValue(1));
32569}
32570
32571static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32572 const X86Subtarget &Subtarget) {
32573 auto *Node = cast<AtomicSDNode>(Op.getNode());
32574 SDLoc dl(Node);
32575 EVT VT = Node->getMemoryVT();
32576
32577 bool IsSeqCst =
32578 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32579 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32580
32581 // If this store is not sequentially consistent and the type is legal
32582 // we can just keep it.
32583 if (!IsSeqCst && IsTypeLegal)
32584 return Op;
32585
32586 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32587 !DAG.getMachineFunction().getFunction().hasFnAttribute(
32588 Attribute::NoImplicitFloat)) {
32589 SDValue Chain;
32590 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32591 // vector store.
32592 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
32593 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32594 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
32595 Node->getMemOperand());
32596 }
32597
32598 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32599 // is enabled.
32600 if (VT == MVT::i64) {
32601 if (Subtarget.hasSSE1()) {
32602 SDValue SclToVec =
32603 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
32604 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32605 SclToVec = DAG.getBitcast(StVT, SclToVec);
32606 SDVTList Tys = DAG.getVTList(MVT::Other);
32607 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
32608 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
32609 MVT::i64, Node->getMemOperand());
32610 } else if (Subtarget.hasX87()) {
32611 // First load this into an 80-bit X87 register using a stack temporary.
32612 // This will put the whole integer into the significand.
32613 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32614 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32615 MachinePointerInfo MPI =
32616 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32617 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
32618 MPI, MaybeAlign(), MachineMemOperand::MOStore);
32619 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32620 SDValue LdOps[] = {Chain, StackPtr};
32621 SDValue Value = DAG.getMemIntrinsicNode(
32622 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
32623 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
32624 Chain = Value.getValue(1);
32625
32626 // Now use an FIST to do the atomic store.
32627 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
32628 Chain =
32629 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
32630 StoreOps, MVT::i64, Node->getMemOperand());
32631 }
32632 }
32633
32634 if (Chain) {
32635 // If this is a sequentially consistent store, also emit an appropriate
32636 // barrier.
32637 if (IsSeqCst)
32638 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
32639
32640 return Chain;
32641 }
32642 }
32643
32644 // Convert seq_cst store -> xchg
32645 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
32646 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
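  // For instance (illustration only): a seq_cst i32 atomic store becomes an
  // (atomic_swap p, v) whose loaded result is ignored, i.e. an XCHG with a
  // memory operand, which is implicitly LOCK-prefixed and so also provides
  // the required fence.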
32647 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
32648 Node->getOperand(0), Node->getOperand(2),
32649 Node->getOperand(1), Node->getMemOperand());
32650 return Swap.getValue(1);
32651}
32652
32654 SDNode *N = Op.getNode();
32655 MVT VT = N->getSimpleValueType(0);
32656 unsigned Opc = Op.getOpcode();
32657
32658 // Let legalize expand this if it isn't a legal type yet.
32659 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32660 return SDValue();
32661
32662 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
32663 SDLoc DL(N);
32664
32665 // Set the carry flag.
32666 SDValue Carry = Op.getOperand(2);
32667 EVT CarryVT = Carry.getValueType();
32668 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
32669 Carry, DAG.getAllOnesConstant(DL, CarryVT));
32670
32671 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
32672 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
32673 Op.getOperand(0), Op.getOperand(1),
32674 Carry.getValue(1));
32675
32676 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
32677 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
32678 Sum.getValue(1), DL, DAG);
32679 if (N->getValueType(1) == MVT::i1)
32680 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
32681
32682 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
32683}
32684
32685static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
32686 SelectionDAG &DAG) {
32687 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
32688
32689 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
32690 // which returns the values as { float, float } (in XMM0) or
32691 // { double, double } (which is returned in XMM0, XMM1).
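  // For instance (illustration only): for f32 the call produces a <4 x float>
  // in XMM0 with sin(x) in lane 0 and cos(x) in lane 1, which is why the
  // !isF64 path below extracts elements 0 and 1 of the call result.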
32692 SDLoc dl(Op);
32693 SDValue Arg = Op.getOperand(0);
32694 EVT ArgVT = Arg.getValueType();
32695 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
32696
32697 TargetLowering::ArgListTy Args;
32698 TargetLowering::ArgListEntry Entry;
32699
32700 Entry.Node = Arg;
32701 Entry.Ty = ArgTy;
32702 Entry.IsSExt = false;
32703 Entry.IsZExt = false;
32704 Args.push_back(Entry);
32705
32706 bool isF64 = ArgVT == MVT::f64;
32707 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
32708 // the small struct {f32, f32} is returned in (eax, edx). For f64,
32709 // the results are returned via SRet in memory.
32710 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32711 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
32712 const char *LibcallName = TLI.getLibcallName(LC);
32713 SDValue Callee =
32714 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
32715
32716 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
32717 : (Type *)FixedVectorType::get(ArgTy, 4);
32718
32719 TargetLowering::CallLoweringInfo CLI(DAG);
32720 CLI.setDebugLoc(dl)
32721 .setChain(DAG.getEntryNode())
32722 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
32723
32724 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
32725
32726 if (isF64)
32727 // Returned in xmm0 and xmm1.
32728 return CallResult.first;
32729
32730 // Returned in bits 0:31 and 32:64 xmm0.
32731 SDValue SinVal =
32732 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
32733 DAG.getVectorIdxConstant(0, dl));
32734 SDValue CosVal =
32735 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
32736 DAG.getVectorIdxConstant(1, dl));
32737 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
32738 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
32739}
32740
32741/// Widen a vector input to a vector of NVT. The
32742/// input vector must have the same element type as NVT.
32743static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
32744 bool FillWithZeroes = false) {
32745 // Check if InOp already has the right width.
32746 MVT InVT = InOp.getSimpleValueType();
32747 if (InVT == NVT)
32748 return InOp;
32749
32750 if (InOp.isUndef())
32751 return DAG.getUNDEF(NVT);
32752
32753 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
32754 "input and widen element type must match");
32755
32756 unsigned InNumElts = InVT.getVectorNumElements();
32757 unsigned WidenNumElts = NVT.getVectorNumElements();
32758 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
32759 "Unexpected request for vector widening");
32760
32761 SDLoc dl(InOp);
32762 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
32763 SDValue N1 = InOp.getOperand(1);
32764 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
32765 N1.isUndef()) {
32766 InOp = InOp.getOperand(0);
32767 InVT = InOp.getSimpleValueType();
32768 InNumElts = InVT.getVectorNumElements();
32769 }
32770 }
32771 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
32772 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
32773 EVT EltVT = InOp.getOperand(0).getValueType();
32774 SDValue FillVal =
32775 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
32776 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
32777 Ops.append(WidenNumElts - InNumElts, FillVal);
32778 return DAG.getBuildVector(NVT, dl, Ops);
32779 }
32780 SDValue FillVal =
32781 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
32782 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
32783 DAG.getVectorIdxConstant(0, dl));
32784}
32785
32786static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
32787 SelectionDAG &DAG) {
32788 assert(Subtarget.hasAVX512() &&
32789 "MGATHER/MSCATTER are supported on AVX-512 arch only");
32790
32791 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
32792 SDValue Src = N->getValue();
32793 MVT VT = Src.getSimpleValueType();
32794 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
32795 SDLoc dl(Op);
32796
32797 SDValue Scale = N->getScale();
32798 SDValue Index = N->getIndex();
32799 SDValue Mask = N->getMask();
32800 SDValue Chain = N->getChain();
32801 SDValue BasePtr = N->getBasePtr();
32802
32803 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
32804 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32805 // If the index is v2i64 and we have VLX we can use xmm for data and index.
32806 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
32807 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32808 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
32809 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
32810 SDVTList VTs = DAG.getVTList(MVT::Other);
32811 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32812 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32813 N->getMemoryVT(), N->getMemOperand());
32814 }
32815 return SDValue();
32816 }
32817
32818 MVT IndexVT = Index.getSimpleValueType();
32819
32820 // If the index is v2i32, we're being called by type legalization and we
32821 // should just let the default handling take care of it.
32822 if (IndexVT == MVT::v2i32)
32823 return SDValue();
32824
32825 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
32826 // need to widen until one is.
32827 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
32828 !Index.getSimpleValueType().is512BitVector()) {
32829 // Determine how much we need to widen by to get a 512-bit type.
32830 unsigned Factor = std::min(512/VT.getSizeInBits(),
32831 512/IndexVT.getSizeInBits());
32832 unsigned NumElts = VT.getVectorNumElements() * Factor;
32833
32834 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32835 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32836 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32837
32838 Src = ExtendToType(Src, VT, DAG);
32839 Index = ExtendToType(Index, IndexVT, DAG);
32840 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32841 }
32842
32843 SDVTList VTs = DAG.getVTList(MVT::Other);
32844 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32845 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32846 N->getMemoryVT(), N->getMemOperand());
32847}
32848
32849static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
32850 SelectionDAG &DAG) {
32851
32852 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
32853 MVT VT = Op.getSimpleValueType();
32854 MVT ScalarVT = VT.getScalarType();
32855 SDValue Mask = N->getMask();
32856 MVT MaskVT = Mask.getSimpleValueType();
32857 SDValue PassThru = N->getPassThru();
32858 SDLoc dl(Op);
32859
32860 // Handle AVX masked loads which don't support passthru other than 0.
32861 if (MaskVT.getVectorElementType() != MVT::i1) {
32862 // We also allow undef in the isel pattern.
32863 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
32864 return Op;
32865
32866 SDValue NewLoad = DAG.getMaskedLoad(
32867 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32868 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32869 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32870 N->isExpandingLoad());
32871 // Emit a blend.
32872 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
32873 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
32874 }
32875
32876 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32877 "Expanding masked load is supported on AVX-512 target only!");
32878
32879 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32880 "Expanding masked load is supported for 32 and 64-bit types only!");
32881
32882 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32883 "Cannot lower masked load op.");
32884
32885 assert((ScalarVT.getSizeInBits() >= 32 ||
32886 (Subtarget.hasBWI() &&
32887 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32888 "Unsupported masked load op.");
32889
32890 // This operation is legal for targets with VLX, but without
32891 // VLX the vector should be widened to 512 bits.
32892 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
32893 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32894 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
32895
32896 // Mask element has to be i1.
32897 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32898 "Unexpected mask type");
32899
32900 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32901
32902 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32903 SDValue NewLoad = DAG.getMaskedLoad(
32904 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32905 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32906 N->getExtensionType(), N->isExpandingLoad());
32907
32908 SDValue Extract =
32909 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
32910 DAG.getVectorIdxConstant(0, dl));
32911 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
32912 return DAG.getMergeValues(RetOps, dl);
32913}
32914
32915static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
32916 SelectionDAG &DAG) {
32917 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
32918 SDValue DataToStore = N->getValue();
32919 MVT VT = DataToStore.getSimpleValueType();
32920 MVT ScalarVT = VT.getScalarType();
32921 SDValue Mask = N->getMask();
32922 SDLoc dl(Op);
32923
32924 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32925 "Compressing masked store is supported on AVX-512 target only!");
32926
32927 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32928 "Compressing masked store is supported for 32 and 64-bit types only!");
32929
32930 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32931 "Cannot lower masked store op.");
32932
32933 assert((ScalarVT.getSizeInBits() >= 32 ||
32934 (Subtarget.hasBWI() &&
32935 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32936 "Unsupported masked store op.");
32937
32938 // This operation is legal for targets with VLX, but without
32939 // VLX the vector should be widened to 512 bits.
32940 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
32941 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32942
32943 // Mask element has to be i1.
32944 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32945 "Unexpected mask type");
32946
32947 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32948
32949 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
32950 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32951 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32952 N->getOffset(), Mask, N->getMemoryVT(),
32953 N->getMemOperand(), N->getAddressingMode(),
32954 N->isTruncatingStore(), N->isCompressingStore());
32955}
32956
32957static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
32958 SelectionDAG &DAG) {
32959 assert(Subtarget.hasAVX2() &&
32960 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32961
32962 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
32963 SDLoc dl(Op);
32964 MVT VT = Op.getSimpleValueType();
32965 SDValue Index = N->getIndex();
32966 SDValue Mask = N->getMask();
32967 SDValue PassThru = N->getPassThru();
32968 MVT IndexVT = Index.getSimpleValueType();
32969
32970 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
32971
32972 // If the index is v2i32, we're being called by type legalization.
32973 if (IndexVT == MVT::v2i32)
32974 return SDValue();
32975
32976 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
32977 // need to widen until one is.
32978 MVT OrigVT = VT;
32979 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32980 !IndexVT.is512BitVector()) {
32981 // Determine how much we need to widen by to get a 512-bit type.
32982 unsigned Factor = std::min(512/VT.getSizeInBits(),
32983 512/IndexVT.getSizeInBits());
32984
32985 unsigned NumElts = VT.getVectorNumElements() * Factor;
32986
32987 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32988 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32989 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32990
32991 PassThru = ExtendToType(PassThru, VT, DAG);
32992 Index = ExtendToType(Index, IndexVT, DAG);
32993 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32994 }
32995
32996 // Break dependency on the data register.
32997 if (PassThru.isUndef())
32998 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
32999
33000 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33001 N->getScale() };
33002 SDValue NewGather = DAG.getMemIntrinsicNode(
33003 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33004 N->getMemOperand());
33005 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33006 DAG.getVectorIdxConstant(0, dl));
33007 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33008}
33009
33010static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33011 SDLoc dl(Op);
33012 SDValue Src = Op.getOperand(0);
33013 MVT DstVT = Op.getSimpleValueType();
33014
33015 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33016 unsigned SrcAS = N->getSrcAddressSpace();
33017
33018 assert(SrcAS != N->getDestAddressSpace() &&
33019 "addrspacecast must be between different address spaces");
33020
33021 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33022 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33023 } else if (DstVT == MVT::i64) {
33024 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33025 } else if (DstVT == MVT::i32) {
33026 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33027 } else {
33028 report_fatal_error("Bad address space in addrspacecast");
33029 }
33030 return Op;
33031}
33032
33033SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33034 SelectionDAG &DAG) const {
33035 // TODO: Eventually, the lowering of these nodes should be informed by or
33036 // deferred to the GC strategy for the function in which they appear. For
33037 // now, however, they must be lowered to something. Since they are logically
33038 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33039 // require special handling for these nodes), lower them as literal NOOPs for
33040 // the time being.
33041 SmallVector<SDValue, 2> Ops;
33042 Ops.push_back(Op.getOperand(0));
33043 if (Op->getGluedNode())
33044 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33045
33046 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33047 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33048}
33049
33050// Custom split CVTPS2PH with wide types.
33051static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33052 SDLoc dl(Op);
33053 EVT VT = Op.getValueType();
33054 SDValue Lo, Hi;
33055 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33056 EVT LoVT, HiVT;
33057 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33058 SDValue RC = Op.getOperand(1);
33059 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33060 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33061 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33062}
33063
33064static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33065 SelectionDAG &DAG) {
33066 unsigned IsData = Op.getConstantOperandVal(4);
33067
33068 // We don't support non-data prefetch without PREFETCHI.
33069 // Just preserve the chain.
33070 if (!IsData && !Subtarget.hasPREFETCHI())
33071 return Op.getOperand(0);
33072
33073 return Op;
33074}
33075
33076static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33077 SDNode *N = Op.getNode();
33078 SDValue Operand = N->getOperand(0);
33079 EVT VT = Operand.getValueType();
33080 SDLoc dl(N);
33081
33082 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33083
33084 // TODO: Fix Crash for bf16 when generating strict_fmul as it
33085 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33086 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33087 // promote this operator's result!
33088 SDValue Chain = DAG.getEntryNode();
33089 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33090 {Chain, Operand, One});
33091 return StrictFmul;
33092}
33093
33094static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33095 unsigned OpNo) {
33096 const APInt Operand(32, OpNo);
33097 std::string OpNoStr = llvm::toString(Operand, 10, false);
33098 std::string Str(" $");
33099
33100 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33101 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33102
33103 auto I = StringRef::npos;
33104 for (auto &AsmStr : AsmStrs) {
33105 // Match the OpNo string. We need to match the full operand text rather than
33106 // a sub-string, e.g. "$12" contains "$1".
33107 if (AsmStr.ends_with(OpNoStr1))
33108 I = AsmStr.size() - OpNoStr1.size();
33109
33110 // Get the index of operand in AsmStr.
33111 if (I == StringRef::npos)
33112 I = AsmStr.find(OpNoStr1 + ",");
33113 if (I == StringRef::npos)
33114 I = AsmStr.find(OpNoStr2);
33115
33116 if (I == StringRef::npos)
33117 continue;
33118
33119 assert(I > 0 && "Unexpected inline asm string!");
33120 // Remove the operand string and label (if it exists).
33121 // For example:
33122 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33123 // ==>
33124 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33125 // ==>
33126 // "call dword ptr "
33127 auto TmpStr = AsmStr.substr(0, I);
33128 I = TmpStr.rfind(':');
33129 if (I != StringRef::npos)
33130 TmpStr = TmpStr.substr(I + 1);
33131 return TmpStr.take_while(llvm::isAlpha);
33132 }
33133
33134 return StringRef();
33135}
33136
33137bool X86TargetLowering::isInlineAsmTargetBranch(
33138 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33139 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33140 // changed from indirect TargetLowering::C_Memory to direct
33141 // TargetLowering::C_Address.
33142 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33143 // location.
33144 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33145 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33146}
33147
33148static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33149 SDValue Mask) {
33150 EVT Ty = MVT::i8;
33151 auto V = DAG.getBitcast(MVT::i1, Mask);
33152 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33153 auto Zero = DAG.getConstant(0, DL, Ty);
33154 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33155 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33156 return SDValue(CmpZero.getNode(), 1);
33157}
33158
33159SDValue X86TargetLowering::visitMaskedLoad(
33160 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33161 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33162 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33163 // ->
33164 // _, flags = SUB 0, mask
33165 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33166 // bit_cast_to_vector<res>
33167 EVT VTy = PassThru.getValueType();
33168 EVT Ty = VTy.getVectorElementType();
33169 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33170 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33171 : DAG.getBitcast(Ty, PassThru);
33172 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33173 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33174 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33175 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33176 return DAG.getBitcast(VTy, NewLoad);
33177}
33178
33179SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33180 SDValue Chain,
33181 MachineMemOperand *MMO, SDValue Ptr,
33182 SDValue Val, SDValue Mask) const {
33183 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33184 // ->
33185 // _, flags = SUB 0, mask
33186 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33187 EVT Ty = Val.getValueType().getVectorElementType();
33188 SDVTList Tys = DAG.getVTList(MVT::Other);
33189 auto ScalarVal = DAG.getBitcast(Ty, Val);
33190 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33191 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33192 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33193 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33194}
33195
33196/// Provide custom lowering hooks for some operations.
33197SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33198 switch (Op.getOpcode()) {
33199 // clang-format off
33200 default: llvm_unreachable("Should not custom lower this!");
33201 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33203 return LowerCMP_SWAP(Op, Subtarget, DAG);
33204 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33209 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33210 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33211 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33212 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33213 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33214 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33215 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33216 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33217 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33218 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33219 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33220 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33221 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33222 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33223 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33224 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33225 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33226 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33227 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33228 case ISD::SHL_PARTS:
33229 case ISD::SRA_PARTS:
33230 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33231 case ISD::FSHL:
33232 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33233 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33235 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33237 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33238 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33239 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33240 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33241 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33244 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33245 case ISD::FP_TO_SINT:
33247 case ISD::FP_TO_UINT:
33248 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33250 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33251 case ISD::FP_EXTEND:
33252 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33253 case ISD::FP_ROUND:
33254 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33255 case ISD::FP16_TO_FP:
33256 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33257 case ISD::FP_TO_FP16:
33258 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33259 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33260 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33261 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33262 case ISD::FADD:
33263 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33264 case ISD::FROUND: return LowerFROUND(Op, DAG);
33265 case ISD::FABS:
33266 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33267 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33268 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33269 case ISD::LRINT:
33270 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33271 case ISD::SETCC:
33272 case ISD::STRICT_FSETCC:
33273 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33274 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33275 case ISD::SELECT: return LowerSELECT(Op, DAG);
33276 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33277 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33278 case ISD::VASTART: return LowerVASTART(Op, DAG);
33279 case ISD::VAARG: return LowerVAARG(Op, DAG);
33280 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33281 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33283 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33284 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33285 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33286 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33288 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33289 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33290 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33291 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33292 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33294 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33295 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33297 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33298 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33299 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33300 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33301 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33302 case ISD::CTLZ:
33303 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33304 case ISD::CTTZ:
33305 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33306 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33307 case ISD::MULHS:
33308 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33309 case ISD::ROTL:
33310 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33311 case ISD::SRA:
33312 case ISD::SRL:
33313 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33314 case ISD::SADDO:
33315 case ISD::UADDO:
33316 case ISD::SSUBO:
33317 case ISD::USUBO: return LowerXALUO(Op, DAG);
33318 case ISD::SMULO:
33319 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33320 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33321 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33322 case ISD::SADDO_CARRY:
33323 case ISD::SSUBO_CARRY:
33324 case ISD::UADDO_CARRY:
33325 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33326 case ISD::ADD:
33327 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33328 case ISD::UADDSAT:
33329 case ISD::SADDSAT:
33330 case ISD::USUBSAT:
33331 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33332 case ISD::SMAX:
33333 case ISD::SMIN:
33334 case ISD::UMAX:
33335 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33336 case ISD::FMINIMUM:
33337 case ISD::FMAXIMUM:
33338 case ISD::FMINIMUMNUM:
33339 case ISD::FMAXIMUMNUM:
33340 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33341 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33342 case ISD::ABDS:
33343 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33344 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33345 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33346 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33347 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33348 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33349 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33351 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33352 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33353 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33354 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33355 // clang-format on
33356 }
33357}
33358
33359/// Replace a node with an illegal result type with a new node built out of
33360/// custom code.
33361void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33362 SmallVectorImpl<SDValue> &Results,
33363 SelectionDAG &DAG) const {
33364 SDLoc dl(N);
33365 unsigned Opc = N->getOpcode();
33366 switch (Opc) {
33367 default:
33368#ifndef NDEBUG
33369 dbgs() << "ReplaceNodeResults: ";
33370 N->dump(&DAG);
33371#endif
33372 llvm_unreachable("Do not know how to custom type legalize this operation!");
33373 case X86ISD::CVTPH2PS: {
33374 EVT VT = N->getValueType(0);
33375 SDValue Lo, Hi;
33376 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33377 EVT LoVT, HiVT;
33378 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33379 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33380 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33381 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33382 Results.push_back(Res);
33383 return;
33384 }
33386 EVT VT = N->getValueType(0);
33387 SDValue Lo, Hi;
33388 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33389 EVT LoVT, HiVT;
33390 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33391 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33392 {N->getOperand(0), Lo});
33393 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33394 {N->getOperand(0), Hi});
33395 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33396 Lo.getValue(1), Hi.getValue(1));
33397 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33398 Results.push_back(Res);
33399 Results.push_back(Chain);
33400 return;
33401 }
33402 case X86ISD::CVTPS2PH:
33403 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33404 return;
33405 case ISD::CTPOP: {
33406 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33407 // If we have at most 32 active bits, then perform as i32 CTPOP.
33408 // TODO: Perform this in generic legalizer?
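// Added example (annotation, not in the upstream source): if the operand is
// known to look like 0x00000000FFFFFF00 (LZ >= 32, TZ >= 8), then
// LZ + TZ >= 32, so it is shifted right by TZ, counted with a 32-bit CTPOP,
// and the result is zero-extended back to i64.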
33409 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33410 unsigned LZ = Known.countMinLeadingZeros();
33411 unsigned TZ = Known.countMinTrailingZeros();
33412 if ((LZ + TZ) >= 32) {
33413 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33414 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33415 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33416 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33417 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33418 Results.push_back(Op);
33419 return;
33420 }
33421 // Use a v2i64 if possible.
33422 bool NoImplicitFloatOps =
33424 Attribute::NoImplicitFloat);
33425 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33426 SDValue Wide =
33427 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33428 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33429 // The bit count fits in 32 bits; extract it as an i32 and then zero-extend
33430 // to i64. Otherwise we end up extracting bits 63:32 separately.
33431 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33432 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33433 DAG.getVectorIdxConstant(0, dl));
33434 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33435 Results.push_back(Wide);
33436 }
33437 return;
33438 }
33439 case ISD::MUL: {
33440 EVT VT = N->getValueType(0);
33442 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33443 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33444 // elements are needed.
33445 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33446 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33447 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33448 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33449 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33450 unsigned NumConcats = 16 / VT.getVectorNumElements();
33451 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33452 ConcatOps[0] = Res;
33453 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33454 Results.push_back(Res);
33455 return;
33456 }
33457 case ISD::SMULO:
33458 case ISD::UMULO: {
33459 EVT VT = N->getValueType(0);
33461 VT == MVT::v2i32 && "Unexpected VT!");
33462 bool IsSigned = Opc == ISD::SMULO;
33463 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33464 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33465 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33466 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33467 // Extract the high 32 bits from each result using PSHUFD.
33468 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
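// Added note (annotation, not in the upstream source): Res holds the two full
// 64-bit products; after the bitcast to v4i32 the high halves live in lanes
// {1, 3}, and they are compared against 0 (UMULO) or against the sign of the
// low half (SMULO) to form the overflow flag.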
33469 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33470 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33471 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33472 DAG.getVectorIdxConstant(0, dl));
33473
33474 // Truncate the low bits of the result. This will become PSHUFD.
33475 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33476
33477 SDValue HiCmp;
33478 if (IsSigned) {
33479 // SMULO overflows if the high bits don't match the sign of the low.
33480 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33481 } else {
33482 // UMULO overflows if the high bits are non-zero.
33483 HiCmp = DAG.getConstant(0, dl, VT);
33484 }
33485 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33486
33487 // Widen the result by padding with undef.
33488 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33489 DAG.getUNDEF(VT));
33490 Results.push_back(Res);
33491 Results.push_back(Ovf);
33492 return;
33493 }
33494 case X86ISD::VPMADDWD: {
33495 // Legalize types for X86ISD::VPMADDWD by widening.
33496 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33497
33498 EVT VT = N->getValueType(0);
33499 EVT InVT = N->getOperand(0).getValueType();
33500 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33501 "Expected a VT that divides into 128 bits.");
33503 "Unexpected type action!");
33504 unsigned NumConcat = 128 / InVT.getSizeInBits();
33505
33506 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33507 InVT.getVectorElementType(),
33508 NumConcat * InVT.getVectorNumElements());
33509 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33511 NumConcat * VT.getVectorNumElements());
33512
33513 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33514 Ops[0] = N->getOperand(0);
33515 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33516 Ops[0] = N->getOperand(1);
33517 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33518
33519 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33520 Results.push_back(Res);
33521 return;
33522 }
33523 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33524 case X86ISD::FMINC:
33525 case X86ISD::FMIN:
33526 case X86ISD::FMAXC:
33527 case X86ISD::FMAX:
33529 case X86ISD::STRICT_FMAX: {
33530 EVT VT = N->getValueType(0);
33531 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33532 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33533 SDValue UNDEF = DAG.getUNDEF(VT);
33534 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33535 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33536 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33537 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33538 SDValue Res;
33539 if (IsStrict)
33540 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33541 {N->getOperand(0), LHS, RHS});
33542 else
33543 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33544 Results.push_back(Res);
33545 if (IsStrict)
33546 Results.push_back(Res.getValue(1));
33547 return;
33548 }
33549 case ISD::SDIV:
33550 case ISD::UDIV:
33551 case ISD::SREM:
33552 case ISD::UREM: {
33553 EVT VT = N->getValueType(0);
33554 if (VT.isVector()) {
33556 "Unexpected type action!");
33557 // If the RHS is a constant splat vector, we can widen this and let the
33558 // division/remainder-by-constant optimization handle it.
33559 // TODO: Can we do something for non-splat?
33560 APInt SplatVal;
33561 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33562 unsigned NumConcats = 128 / VT.getSizeInBits();
33563 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33564 Ops0[0] = N->getOperand(0);
33565 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33566 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33567 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33568 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33569 Results.push_back(Res);
33570 }
33571 return;
33572 }
33573
33574 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33575 Results.push_back(V);
33576 return;
33577 }
33578 case ISD::TRUNCATE: {
33579 MVT VT = N->getSimpleValueType(0);
33580 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33581 return;
33582
33583 // The generic legalizer will try to widen the input type to the same
33584 // number of elements as the widened result type. But this isn't always
33585 // the best thing so do some custom legalization to avoid some cases.
33586 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33587 SDValue In = N->getOperand(0);
33588 EVT InVT = In.getValueType();
33589 EVT InEltVT = InVT.getVectorElementType();
33590 EVT EltVT = VT.getVectorElementType();
33591 unsigned MinElts = VT.getVectorNumElements();
33592 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33593 unsigned InBits = InVT.getSizeInBits();
33594
33595 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33596 unsigned PackOpcode;
33597 if (SDValue Src =
33598 matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
33599 if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
33600 dl, DAG, Subtarget)) {
33601 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
33602 Results.push_back(Res);
33603 return;
33604 }
33605 }
33606
33607 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
33608 // 128-bit and smaller inputs should avoid the truncate altogether and
33609 // use a shuffle.
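// Added example (annotation, not in the upstream source): truncating
// v2i64 -> v2i32 with a widened v4i32 result just selects the even i32 lanes,
// i.e. Scale == 2, MinElts == 2 and the mask below becomes {0, 2, -1, -1}.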
33610 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
33611 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
33612 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
33613 for (unsigned I = 0; I < MinElts; ++I)
33614 TruncMask[I] = Scale * I;
33615 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
33616 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
33617 "Illegal vector type in truncation");
33618 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
33619 Results.push_back(
33620 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
33621 return;
33622 }
33623 }
33624
33625 // With AVX512 there are some cases that can use a target specific
33626 // truncate node to go from 256/512 to less than 128 with zeros in the
33627 // upper elements of the 128 bit result.
33628 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
33629 // We can use VTRUNC directly for 256-bit inputs with VLX, or for any 512-bit input.
33630 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
33631 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33632 return;
33633 }
33634 // There's one case we can widen to 512 bits and use VTRUNC.
33635 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
33636 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
33637 DAG.getUNDEF(MVT::v4i64));
33638 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33639 return;
33640 }
33641 }
33642 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
33643 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
33644 isTypeLegal(MVT::v4i64)) {
33645 // The input needs to be split and the output needs to be widened. Let's use
33646 // two VTRUNCs, and shuffle their results together into the wider type.
33647 SDValue Lo, Hi;
33648 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
33649
33650 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
33651 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
33652 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
33653 { 0, 1, 2, 3, 16, 17, 18, 19,
33654 -1, -1, -1, -1, -1, -1, -1, -1 });
33655 Results.push_back(Res);
33656 return;
33657 }
33658
33659 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
33660 // this via type legalization.
33661 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
33662 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
33663 (!Subtarget.hasSSSE3() ||
33664 (!isTypeLegal(InVT) &&
33665 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
33666 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
33667 InEltVT.getSizeInBits() * WidenNumElts);
33668 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
33669 return;
33670 }
33671
33672 return;
33673 }
33674 case ISD::ANY_EXTEND:
33675 // Right now, only MVT::v8i8 has Custom action for an illegal type.
33676 // It's intended to custom handle the input type.
33677 assert(N->getValueType(0) == MVT::v8i8 &&
33678 "Do not know how to legalize this Node");
33679 return;
33680 case ISD::SIGN_EXTEND:
33681 case ISD::ZERO_EXTEND: {
33682 EVT VT = N->getValueType(0);
33683 SDValue In = N->getOperand(0);
33684 EVT InVT = In.getValueType();
33685 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
33686 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
33688 "Unexpected type action!");
33689 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
33690 // Custom split this so we can extend i8/i16->i32 in-vector. This is better
33691 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using sra,
33692 // followed by an extend from i32 to i64 using pcmpgt. By custom splitting
33693 // we allow the sra from the extend to i32 to be shared by the split.
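// Added example (annotation, not in the upstream source): for v4i16 -> v4i64
// the input is first sign-extended to v4i32; SignBits = (0 > In) yields each
// element's sign mask, and interleaving {value, sign} pairs with
// unpackl/unpackh produces the two v2i64 halves that are concatenated below.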
33694 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
33695
33696 // Fill a vector with sign bits for each element.
33697 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
33698 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
33699
33700 // Create an unpackl and unpackh to interleave the sign bits then bitcast
33701 // to v2i64.
33702 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33703 {0, 4, 1, 5});
33704 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
33705 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33706 {2, 6, 3, 7});
33707 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
33708
33709 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33710 Results.push_back(Res);
33711 return;
33712 }
33713
33714 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
33715 if (!InVT.is128BitVector()) {
33716 // Not a 128 bit vector, but maybe type legalization will promote
33717 // it to 128 bits.
33718 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
33719 return;
33720 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
33721 if (!InVT.is128BitVector())
33722 return;
33723
33724 // Promote the input to 128 bits. Type legalization will turn this into
33725 // zext_inreg/sext_inreg.
33726 In = DAG.getNode(Opc, dl, InVT, In);
33727 }
33728
33729 // Perform custom splitting instead of the two stage extend we would get
33730 // by default.
33731 EVT LoVT, HiVT;
33732 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
33733 assert(isTypeLegal(LoVT) && "Split VT not legal?");
33734
33735 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
33736
33737 // We need to shift the input over by half the number of elements.
33738 unsigned NumElts = InVT.getVectorNumElements();
33739 unsigned HalfNumElts = NumElts / 2;
33740 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
33741 for (unsigned i = 0; i != HalfNumElts; ++i)
33742 ShufMask[i] = i + HalfNumElts;
33743
33744 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
33745 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
33746
33747 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33748 Results.push_back(Res);
33749 }
33750 return;
33751 }
33753 case ISD::FP_TO_UINT_SAT: {
33754 if (!Subtarget.hasAVX10_2())
33755 return;
33756
33757 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
33758 EVT VT = N->getValueType(0);
33759 SDValue Op = N->getOperand(0);
33760 EVT OpVT = Op.getValueType();
33761 SDValue Res;
33762
33763 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
33764 if (IsSigned)
33765 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
33766 else
33767 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
33768 Results.push_back(Res);
33769 }
33770 return;
33771 }
33772 case ISD::FP_TO_SINT:
33774 case ISD::FP_TO_UINT:
33776 bool IsStrict = N->isStrictFPOpcode();
33777 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
33778 EVT VT = N->getValueType(0);
33779 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33780 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33781 EVT SrcVT = Src.getValueType();
33782
33783 SDValue Res;
33784 if (isSoftF16(SrcVT, Subtarget)) {
33785 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
33786 if (IsStrict) {
33787 Res =
33788 DAG.getNode(Opc, dl, {VT, MVT::Other},
33789 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
33790 {NVT, MVT::Other}, {Chain, Src})});
33791 Chain = Res.getValue(1);
33792 } else {
33793 Res =
33794 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
33795 }
33796 Results.push_back(Res);
33797 if (IsStrict)
33798 Results.push_back(Chain);
33799
33800 return;
33801 }
33802
33803 if (VT.isVector() && Subtarget.hasFP16() &&
33804 SrcVT.getVectorElementType() == MVT::f16) {
33805 EVT EleVT = VT.getVectorElementType();
33806 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
33807
33808 if (SrcVT != MVT::v8f16) {
33809 SDValue Tmp =
33810 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
33811 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
33812 Ops[0] = Src;
33813 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
33814 }
33815
33816 if (IsStrict) {
33818 Res =
33819 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
33820 Chain = Res.getValue(1);
33821 } else {
33822 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33823 Res = DAG.getNode(Opc, dl, ResVT, Src);
33824 }
33825
33826 // TODO: Need to add exception check code for strict FP.
33827 if (EleVT.getSizeInBits() < 16) {
33828 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
33829 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
33830
33831 // Now widen to 128 bits.
33832 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
33833 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
33834 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
33835 ConcatOps[0] = Res;
33836 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33837 }
33838
33839 Results.push_back(Res);
33840 if (IsStrict)
33841 Results.push_back(Chain);
33842
33843 return;
33844 }
33845
33846 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
33848 "Unexpected type action!");
33849
33850 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
33851 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
33852 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
33854 SDValue Res;
33855 SDValue Chain;
33856 if (IsStrict) {
33857 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
33858 {N->getOperand(0), Src});
33859 Chain = Res.getValue(1);
33860 } else
33861 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
33862
33863 // Preserve what we know about the size of the original result. If the
33864 // result is v2i32, we have to manually widen the assert.
33865 if (PromoteVT == MVT::v2i32)
33866 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33867 DAG.getUNDEF(MVT::v2i32));
33868
33869 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
33870 Res.getValueType(), Res,
33872
33873 if (PromoteVT == MVT::v2i32)
33874 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
33875 DAG.getVectorIdxConstant(0, dl));
33876
33877 // Truncate back to the original width.
33878 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33879
33880 // Now widen to 128 bits.
33881 unsigned NumConcats = 128 / VT.getSizeInBits();
33883 VT.getVectorNumElements() * NumConcats);
33884 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33885 ConcatOps[0] = Res;
33886 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33887 Results.push_back(Res);
33888 if (IsStrict)
33889 Results.push_back(Chain);
33890 return;
33891 }
33892
33893
33894 if (VT == MVT::v2i32) {
33895 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
33896 "Strict unsigned conversion requires AVX512");
33897 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33899 "Unexpected type action!");
33900 if (Src.getValueType() == MVT::v2f64) {
33901 if (!IsSigned && !Subtarget.hasAVX512()) {
33902 SDValue Res =
33903 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
33904 Results.push_back(Res);
33905 return;
33906 }
33907
33908 if (IsStrict)
33910 else
33911 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33912
33913 // If we have VLX we can emit a target-specific FP_TO_UINT node.
33914 if (!IsSigned && !Subtarget.hasVLX()) {
33915 // Otherwise we can defer to the generic legalizer which will widen
33916 // the input as well. This will be further widened during op
33917 // legalization to v8i32<-v8f64.
33918 // For strict nodes we'll need to widen ourselves.
33919 // FIXME: Fix the type legalizer to safely widen strict nodes?
33920 if (!IsStrict)
33921 return;
33922 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
33923 DAG.getConstantFP(0.0, dl, MVT::v2f64));
33924 Opc = N->getOpcode();
33925 }
33926 SDValue Res;
33927 SDValue Chain;
33928 if (IsStrict) {
33929 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33930 {N->getOperand(0), Src});
33931 Chain = Res.getValue(1);
33932 } else {
33933 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
33934 }
33935 Results.push_back(Res);
33936 if (IsStrict)
33937 Results.push_back(Chain);
33938 return;
33939 }
33940
33941 // Custom widen strict v2f32->v2i32 by padding with zeros.
33942 // FIXME: Should generic type legalizer do this?
33943 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
33944 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
33945 DAG.getConstantFP(0.0, dl, MVT::v2f32));
33946 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33947 {N->getOperand(0), Src});
33948 Results.push_back(Res);
33949 Results.push_back(Res.getValue(1));
33950 return;
33951 }
33952
33953 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
33954 // so early out here.
33955 return;
33956 }
33957
33958 assert(!VT.isVector() && "Vectors should have been handled above!");
33959
33960 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
33961 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
33962 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
33963 assert(!Subtarget.is64Bit() && "i64 should be legal");
33964 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
33965 // If we use a 128-bit result we might need to use a target specific node.
33966 unsigned SrcElts =
33967 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
33968 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
33969 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
33970 if (NumElts != SrcElts) {
33971 if (IsStrict)
33973 else
33974 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33975 }
33976
33977 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
33978 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
33979 DAG.getConstantFP(0.0, dl, VecInVT), Src,
33980 ZeroIdx);
33981 SDValue Chain;
33982 if (IsStrict) {
33983 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
33984 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
33985 Chain = Res.getValue(1);
33986 } else
33987 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
33988 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
33989 Results.push_back(Res);
33990 if (IsStrict)
33991 Results.push_back(Chain);
33992 return;
33993 }
33994
33995 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
33996 SDValue Chain;
33997 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
33998 Results.push_back(V);
33999 if (IsStrict)
34000 Results.push_back(Chain);
34001 return;
34002 }
34003
34004 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34005 Results.push_back(V);
34006 if (IsStrict)
34007 Results.push_back(Chain);
34008 }
34009 return;
34010 }
34011 case ISD::LRINT:
34012 case ISD::LLRINT: {
34013 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34014 Results.push_back(V);
34015 return;
34016 }
34017
34018 case ISD::SINT_TO_FP:
34020 case ISD::UINT_TO_FP:
34022 bool IsStrict = N->isStrictFPOpcode();
34023 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34024 EVT VT = N->getValueType(0);
34025 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34026 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34027 Subtarget.hasVLX()) {
34028 if (Src.getValueType().getVectorElementType() == MVT::i16)
34029 return;
34030
34031 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34032 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34033 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34034 : DAG.getUNDEF(MVT::v2i32));
34035 if (IsStrict) {
34036 unsigned Opc =
34038 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34039 {N->getOperand(0), Src});
34040 Results.push_back(Res);
34041 Results.push_back(Res.getValue(1));
34042 } else {
34043 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34044 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34045 }
34046 return;
34047 }
34048 if (VT != MVT::v2f32)
34049 return;
34050 EVT SrcVT = Src.getValueType();
34051 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34052 if (IsStrict) {
34053 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34055 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34056 {N->getOperand(0), Src});
34057 Results.push_back(Res);
34058 Results.push_back(Res.getValue(1));
34059 } else {
34060 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34061 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34062 }
34063 return;
34064 }
34065 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34066 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
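// Added note (annotation, not in the upstream source): without AVX512 there
// is no unsigned i64 -> f32 conversion, so lanes with the sign bit set are
// halved with a round-to-odd shift ((x >> 1) | (x & 1)), converted as signed,
// and doubled with an fadd; the final select takes that slow path only for
// those "negative" lanes.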
34067 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34068 SDValue One = DAG.getConstant(1, dl, SrcVT);
34069 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34070 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34071 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34072 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34073 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34074 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34075 for (int i = 0; i != 2; ++i) {
34076 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34077 SignSrc, DAG.getVectorIdxConstant(i, dl));
34078 if (IsStrict)
34079 SignCvts[i] =
34080 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34081 {N->getOperand(0), Elt});
34082 else
34083 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34084 };
34085 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34086 SDValue Slow, Chain;
34087 if (IsStrict) {
34088 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34089 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34090 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34091 {Chain, SignCvt, SignCvt});
34092 Chain = Slow.getValue(1);
34093 } else {
34094 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34095 }
34096 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34097 IsNeg =
34098 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34099 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34100 Results.push_back(Cvt);
34101 if (IsStrict)
34102 Results.push_back(Chain);
34103 return;
34104 }
34105
34106 if (SrcVT != MVT::v2i32)
34107 return;
34108
34109 if (IsSigned || Subtarget.hasAVX512()) {
34110 if (!IsStrict)
34111 return;
34112
34113 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34114 // FIXME: Should generic type legalizer do this?
34115 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34116 DAG.getConstant(0, dl, MVT::v2i32));
34117 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34118 {N->getOperand(0), Src});
34119 Results.push_back(Res);
34120 Results.push_back(Res.getValue(1));
34121 return;
34122 }
34123
34124 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34125 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
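// Added note (annotation, not in the upstream source): 0x4330000000000000 is
// the bit pattern of 2^52. OR'ing the zero-extended u32 into the mantissa of
// 2^52 yields the double value (2^52 + x) exactly, so the FSUB below recovers
// x as an exact f64 before it is rounded to v4f32.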
34126 SDValue VBias = DAG.getConstantFP(
34127 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34128 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34129 DAG.getBitcast(MVT::v2i64, VBias));
34130 Or = DAG.getBitcast(MVT::v2f64, Or);
34131 if (IsStrict) {
34132 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34133 {N->getOperand(0), Or, VBias});
34135 {MVT::v4f32, MVT::Other},
34136 {Sub.getValue(1), Sub});
34137 Results.push_back(Res);
34138 Results.push_back(Res.getValue(1));
34139 } else {
34140 // TODO: Are there any fast-math-flags to propagate here?
34141 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34142 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34143 }
34144 return;
34145 }
34147 case ISD::FP_ROUND: {
34148 bool IsStrict = N->isStrictFPOpcode();
34149 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34150 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34151 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34152 EVT SrcVT = Src.getValueType();
34153 EVT VT = N->getValueType(0);
34154 SDValue V;
34155 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34156 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34157 : DAG.getUNDEF(MVT::v2f32);
34158 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34159 }
34160 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34161 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34162 if (SrcVT.getVectorElementType() != MVT::f32)
34163 return;
34164
34165 if (IsStrict)
34166 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34167 {Chain, Src, Rnd});
34168 else
34169 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34170
34171 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34172 if (IsStrict)
34173 Results.push_back(V.getValue(1));
34174 return;
34175 }
34176 if (!isTypeLegal(Src.getValueType()))
34177 return;
34178 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34179 if (IsStrict)
34180 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34181 {Chain, Src});
34182 else
34183 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34184 Results.push_back(V);
34185 if (IsStrict)
34186 Results.push_back(V.getValue(1));
34187 return;
34188 }
34189 case ISD::FP_EXTEND:
34190 case ISD::STRICT_FP_EXTEND: {
34191 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34192 // No other ValueType for FP_EXTEND should reach this point.
34193 assert(N->getValueType(0) == MVT::v2f32 &&
34194 "Do not know how to legalize this Node");
34195 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34196 return;
34197 bool IsStrict = N->isStrictFPOpcode();
34198 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34199 if (Src.getValueType().getVectorElementType() != MVT::f16)
34200 return;
34201 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34202 : DAG.getUNDEF(MVT::v2f16);
34203 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34204 if (IsStrict)
34205 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34206 {N->getOperand(0), V});
34207 else
34208 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34209 Results.push_back(V);
34210 if (IsStrict)
34211 Results.push_back(V.getValue(1));
34212 return;
34213 }
34214 case ISD::INTRINSIC_W_CHAIN: {
34215 unsigned IntNo = N->getConstantOperandVal(1);
34216 switch (IntNo) {
34217 default : llvm_unreachable("Do not know how to custom type "
34218 "legalize this intrinsic operation!");
34219 case Intrinsic::x86_rdtsc:
34220 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34221 Results);
34222 case Intrinsic::x86_rdtscp:
34223 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34224 Results);
34225 case Intrinsic::x86_rdpmc:
34226 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34227 Results);
34228 return;
34229 case Intrinsic::x86_rdpru:
34230 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34231 Results);
34232 return;
34233 case Intrinsic::x86_xgetbv:
34234 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34235 Results);
34236 return;
34237 }
34238 }
34239 case ISD::READCYCLECOUNTER: {
34240 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34241 }
34242 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34243 EVT T = N->getValueType(0);
34244 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34245 bool Regs64bit = T == MVT::i128;
34246 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34247 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34248 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
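// Added note (annotation, not in the upstream source): CMPXCHG8B/CMPXCHG16B
// expect the compare value in EDX:EAX (RDX:RAX), the replacement value in
// ECX:EBX (RCX:RBX), and return the old value in EDX:EAX (RDX:RAX) with ZF
// reporting success; the register copies below set this up and read it back.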
34249 SDValue cpInL, cpInH;
34250 std::tie(cpInL, cpInH) =
34251 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34252 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34253 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34254 cpInH =
34255 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34256 cpInH, cpInL.getValue(1));
34257 SDValue swapInL, swapInH;
34258 std::tie(swapInL, swapInH) =
34259 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34260 swapInH =
34261 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34262 swapInH, cpInH.getValue(1));
34263
34264 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34265 // until later. So we keep the RBX input in a vreg and use a custom
34266 // inserter.
34267 // Since RBX will be a reserved register, the register allocator will not
34268 // make sure its value is properly saved and restored around this
34269 // live-range.
34270 SDValue Result;
34271 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34272 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34273 if (Regs64bit) {
34274 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34275 swapInH.getValue(1)};
34276 Result =
34277 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34278 } else {
34279 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34280 swapInH.getValue(1));
34281 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34282 swapInL.getValue(1)};
34283 Result =
34284 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34285 }
34286
34287 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34288 Regs64bit ? X86::RAX : X86::EAX,
34289 HalfT, Result.getValue(1));
34290 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34291 Regs64bit ? X86::RDX : X86::EDX,
34292 HalfT, cpOutL.getValue(2));
34293 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34294
34295 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34296 MVT::i32, cpOutH.getValue(2));
34297 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34298 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34299
34300 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34301 Results.push_back(Success);
34302 Results.push_back(EFLAGS.getValue(1));
34303 return;
34304 }
34305 case ISD::ATOMIC_LOAD: {
34306 assert(
34307 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34308 "Unexpected VT!");
34309 bool NoImplicitFloatOps =
34311 Attribute::NoImplicitFloat);
34312 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34313 auto *Node = cast<AtomicSDNode>(N);
34314
34315 if (N->getValueType(0) == MVT::i128) {
34316 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34317 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34318 Node->getBasePtr(), Node->getMemOperand());
34319 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34320 DAG.getVectorIdxConstant(0, dl));
34321 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34322 DAG.getVectorIdxConstant(1, dl));
34323 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34324 {ResL, ResH}));
34325 Results.push_back(Ld.getValue(1));
34326 return;
34327 }
34328 break;
34329 }
34330 if (Subtarget.hasSSE1()) {
34331 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34332 // Then extract the lower 64-bits.
34333 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34334 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34335 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34336 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34337 MVT::i64, Node->getMemOperand());
34338 if (Subtarget.hasSSE2()) {
34339 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34340 DAG.getVectorIdxConstant(0, dl));
34341 Results.push_back(Res);
34342 Results.push_back(Ld.getValue(1));
34343 return;
34344 }
34345 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34346 // then casts to i64. This avoids a 128-bit stack temporary being
34347 // created by type legalization if we were to cast v4f32->v2i64.
34348 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34349 DAG.getVectorIdxConstant(0, dl));
34350 Res = DAG.getBitcast(MVT::i64, Res);
34351 Results.push_back(Res);
34352 Results.push_back(Ld.getValue(1));
34353 return;
34354 }
34355 if (Subtarget.hasX87()) {
34356 // First load this into an 80-bit X87 register. This will put the whole
34357 // integer into the significand.
34358 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34359 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34361 dl, Tys, Ops, MVT::i64,
34362 Node->getMemOperand());
34363 SDValue Chain = Result.getValue(1);
34364
34365 // Now store the X87 register to a stack temporary and convert to i64.
34366 // This store is not atomic and doesn't need to be.
34367 // FIXME: We don't need a stack temporary if the result of the load
34368 // is already being stored. We could just directly store there.
34369 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34370 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34371 MachinePointerInfo MPI =
34373 SDValue StoreOps[] = { Chain, Result, StackPtr };
34374 Chain = DAG.getMemIntrinsicNode(
34375 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34376 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34377
34378 // Finally load the value back from the stack temporary and return it.
34379 // This load is not atomic and doesn't need to be.
34380 // This load will be further type legalized.
34381 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34382 Results.push_back(Result);
34383 Results.push_back(Result.getValue(1));
34384 return;
34385 }
34386 }
34387 // TODO: Use MOVLPS when SSE1 is available?
34388 // Delegate to generic TypeLegalization. Situations we can really handle
34389 // should have already been dealt with by AtomicExpandPass.cpp.
34390 break;
34391 }
34392 case ISD::ATOMIC_SWAP:
34393 case ISD::ATOMIC_LOAD_ADD:
34394 case ISD::ATOMIC_LOAD_SUB:
34395 case ISD::ATOMIC_LOAD_AND:
34396 case ISD::ATOMIC_LOAD_OR:
34397 case ISD::ATOMIC_LOAD_XOR:
34398 case ISD::ATOMIC_LOAD_NAND:
34399 case ISD::ATOMIC_LOAD_MIN:
34400 case ISD::ATOMIC_LOAD_MAX:
34401 case ISD::ATOMIC_LOAD_UMIN:
34402 case ISD::ATOMIC_LOAD_UMAX:
34403 // Delegate to generic TypeLegalization. Situations we can really handle
34404 // should have already been dealt with by AtomicExpandPass.cpp.
34405 break;
34406
34407 case ISD::BITCAST: {
34408 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34409 EVT DstVT = N->getValueType(0);
34410 EVT SrcVT = N->getOperand(0).getValueType();
34411
34412 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
34413 // we can split using the k-register rather than memory.
34414 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34415 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34416 SDValue Lo, Hi;
34417 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34418 Lo = DAG.getBitcast(MVT::i32, Lo);
34419 Hi = DAG.getBitcast(MVT::i32, Hi);
34420 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34421 Results.push_back(Res);
34422 return;
34423 }
34424
34425 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34426 // FIXME: Use v4f32 for SSE1?
34427 assert(Subtarget.hasSSE2() && "Requires SSE2");
34428 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34429 "Unexpected type action!");
34430 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34431 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34432 N->getOperand(0));
34433 Res = DAG.getBitcast(WideVT, Res);
34434 Results.push_back(Res);
34435 return;
34436 }
34437
34438 return;
34439 }
34440 case ISD::MGATHER: {
34441 EVT VT = N->getValueType(0);
34442 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34443 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34444 auto *Gather = cast<MaskedGatherSDNode>(N);
34445 SDValue Index = Gather->getIndex();
34446 if (Index.getValueType() != MVT::v2i64)
34447 return;
34449 "Unexpected type action!");
34450 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34451 SDValue Mask = Gather->getMask();
34452 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34453 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34454 Gather->getPassThru(),
34455 DAG.getUNDEF(VT));
34456 if (!Subtarget.hasVLX()) {
34457 // We need to widen the mask, but the instruction will only use 2
34458 // of its elements. So we can use undef.
34459 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34460 DAG.getUNDEF(MVT::v2i1));
34461 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34462 }
34463 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34464 Gather->getBasePtr(), Index, Gather->getScale() };
34465 SDValue Res = DAG.getMemIntrinsicNode(
34466 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34467 Gather->getMemoryVT(), Gather->getMemOperand());
34468 Results.push_back(Res);
34469 Results.push_back(Res.getValue(1));
34470 return;
34471 }
34472 return;
34473 }
34474 case ISD::LOAD: {
34475 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34476 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34477 // cast since type legalization will try to use an i64 load.
34478 MVT VT = N->getSimpleValueType(0);
34479 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34481 "Unexpected type action!");
34482 if (!ISD::isNON_EXTLoad(N))
34483 return;
34484 auto *Ld = cast<LoadSDNode>(N);
34485 if (Subtarget.hasSSE2()) {
34486 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34487 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34488 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34489 Ld->getMemOperand()->getFlags());
34490 SDValue Chain = Res.getValue(1);
34491 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34492 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34493 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34494 Res = DAG.getBitcast(WideVT, Res);
34495 Results.push_back(Res);
34496 Results.push_back(Chain);
34497 return;
34498 }
34499 assert(Subtarget.hasSSE1() && "Expected SSE");
34500 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34501 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34502 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34503 MVT::i64, Ld->getMemOperand());
34504 Results.push_back(Res);
34505 Results.push_back(Res.getValue(1));
34506 return;
34507 }
34508 case ISD::ADDRSPACECAST: {
34509 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34510 Results.push_back(V);
34511 return;
34512 }
34513 case ISD::BITREVERSE: {
34514 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34515 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34516 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34517 // We'll need to move the scalar in two i32 pieces.
34518 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34519 return;
34520 }
34521 case ISD::EXTRACT_VECTOR_ELT: {
34522 // f16 = extract vXf16 %vec, i64 %idx
34523 assert(N->getSimpleValueType(0) == MVT::f16 &&
34524 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34525 assert(Subtarget.hasFP16() && "Expected FP16");
34526 SDValue VecOp = N->getOperand(0);
34528 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34529 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34530 N->getOperand(1));
34531 Split = DAG.getBitcast(MVT::f16, Split);
34532 Results.push_back(Split);
34533 return;
34534 }
34535 }
34536}
34537
34538const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34539 switch ((X86ISD::NodeType)Opcode) {
34540 case X86ISD::FIRST_NUMBER: break;
34541#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34542 NODE_NAME_CASE(BSF)
34543 NODE_NAME_CASE(BSR)
34544 NODE_NAME_CASE(FSHL)
34545 NODE_NAME_CASE(FSHR)
34546 NODE_NAME_CASE(FAND)
34547 NODE_NAME_CASE(FANDN)
34548 NODE_NAME_CASE(FOR)
34549 NODE_NAME_CASE(FXOR)
34550 NODE_NAME_CASE(FILD)
34551 NODE_NAME_CASE(FIST)
34552 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34553 NODE_NAME_CASE(FLD)
34554 NODE_NAME_CASE(FST)
34555 NODE_NAME_CASE(CALL)
34556 NODE_NAME_CASE(CALL_RVMARKER)
34558 NODE_NAME_CASE(CMP)
34559 NODE_NAME_CASE(FCMP)
34560 NODE_NAME_CASE(STRICT_FCMP)
34561 NODE_NAME_CASE(STRICT_FCMPS)
34563 NODE_NAME_CASE(UCOMI)
34564 NODE_NAME_CASE(COMX)
34565 NODE_NAME_CASE(UCOMX)
34566 NODE_NAME_CASE(CMPM)
34567 NODE_NAME_CASE(CMPMM)
34568 NODE_NAME_CASE(STRICT_CMPM)
34569 NODE_NAME_CASE(CMPMM_SAE)
34570 NODE_NAME_CASE(SETCC)
34571 NODE_NAME_CASE(SETCC_CARRY)
34572 NODE_NAME_CASE(FSETCC)
34573 NODE_NAME_CASE(FSETCCM)
34574 NODE_NAME_CASE(FSETCCM_SAE)
34575 NODE_NAME_CASE(CMOV)
34576 NODE_NAME_CASE(BRCOND)
34577 NODE_NAME_CASE(RET_GLUE)
34578 NODE_NAME_CASE(IRET)
34579 NODE_NAME_CASE(REP_STOS)
34580 NODE_NAME_CASE(REP_MOVS)
34581 NODE_NAME_CASE(GlobalBaseReg)
34583 NODE_NAME_CASE(WrapperRIP)
34584 NODE_NAME_CASE(MOVQ2DQ)
34585 NODE_NAME_CASE(MOVDQ2Q)
34586 NODE_NAME_CASE(MMX_MOVD2W)
34587 NODE_NAME_CASE(MMX_MOVW2D)
34588 NODE_NAME_CASE(PEXTRB)
34589 NODE_NAME_CASE(PEXTRW)
34590 NODE_NAME_CASE(INSERTPS)
34591 NODE_NAME_CASE(PINSRB)
34592 NODE_NAME_CASE(PINSRW)
34593 NODE_NAME_CASE(PSHUFB)
34594 NODE_NAME_CASE(ANDNP)
34595 NODE_NAME_CASE(BLENDI)
34597 NODE_NAME_CASE(HADD)
34598 NODE_NAME_CASE(HSUB)
34599 NODE_NAME_CASE(FHADD)
34600 NODE_NAME_CASE(FHSUB)
34601 NODE_NAME_CASE(CONFLICT)
34602 NODE_NAME_CASE(FMAX)
34603 NODE_NAME_CASE(FMAXS)
34604 NODE_NAME_CASE(FMAX_SAE)
34605 NODE_NAME_CASE(FMAXS_SAE)
34606 NODE_NAME_CASE(STRICT_FMAX)
34607 NODE_NAME_CASE(FMIN)
34608 NODE_NAME_CASE(FMINS)
34609 NODE_NAME_CASE(FMIN_SAE)
34610 NODE_NAME_CASE(FMINS_SAE)
34611 NODE_NAME_CASE(STRICT_FMIN)
34612 NODE_NAME_CASE(FMAXC)
34613 NODE_NAME_CASE(FMINC)
34614 NODE_NAME_CASE(FRSQRT)
34615 NODE_NAME_CASE(FRCP)
34616 NODE_NAME_CASE(EXTRQI)
34617 NODE_NAME_CASE(INSERTQI)
34618 NODE_NAME_CASE(TLSADDR)
34619 NODE_NAME_CASE(TLSBASEADDR)
34620 NODE_NAME_CASE(TLSCALL)
34621 NODE_NAME_CASE(TLSDESC)
34622 NODE_NAME_CASE(EH_SJLJ_SETJMP)
34623 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
34624 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
34625 NODE_NAME_CASE(EH_RETURN)
34626 NODE_NAME_CASE(TC_RETURN)
34627 NODE_NAME_CASE(FNSTCW16m)
34628 NODE_NAME_CASE(FLDCW16m)
34629 NODE_NAME_CASE(FNSTENVm)
34630 NODE_NAME_CASE(FLDENVm)
34631 NODE_NAME_CASE(LCMPXCHG_DAG)
34632 NODE_NAME_CASE(LCMPXCHG8_DAG)
34633 NODE_NAME_CASE(LCMPXCHG16_DAG)
34634 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
34635 NODE_NAME_CASE(LADD)
34636 NODE_NAME_CASE(LSUB)
34637 NODE_NAME_CASE(LOR)
34638 NODE_NAME_CASE(LXOR)
34639 NODE_NAME_CASE(LAND)
34640 NODE_NAME_CASE(LBTS)
34641 NODE_NAME_CASE(LBTC)
34642 NODE_NAME_CASE(LBTR)
34643 NODE_NAME_CASE(LBTS_RM)
34644 NODE_NAME_CASE(LBTC_RM)
34645 NODE_NAME_CASE(LBTR_RM)
34646 NODE_NAME_CASE(AADD)
34647 NODE_NAME_CASE(AOR)
34648 NODE_NAME_CASE(AXOR)
34649 NODE_NAME_CASE(AAND)
34650 NODE_NAME_CASE(VZEXT_MOVL)
34651 NODE_NAME_CASE(VZEXT_LOAD)
34652 NODE_NAME_CASE(VEXTRACT_STORE)
34653 NODE_NAME_CASE(VTRUNC)
34654 NODE_NAME_CASE(VTRUNCS)
34655 NODE_NAME_CASE(VTRUNCUS)
34656 NODE_NAME_CASE(VMTRUNC)
34657 NODE_NAME_CASE(VMTRUNCS)
34658 NODE_NAME_CASE(VMTRUNCUS)
34659 NODE_NAME_CASE(VTRUNCSTORES)
34660 NODE_NAME_CASE(VTRUNCSTOREUS)
34661 NODE_NAME_CASE(VMTRUNCSTORES)
34662 NODE_NAME_CASE(VMTRUNCSTOREUS)
34663 NODE_NAME_CASE(VFPEXT)
34664 NODE_NAME_CASE(STRICT_VFPEXT)
34665 NODE_NAME_CASE(VFPEXT_SAE)
34666 NODE_NAME_CASE(VFPEXTS)
34667 NODE_NAME_CASE(VFPEXTS_SAE)
34668 NODE_NAME_CASE(VFPROUND)
34669 NODE_NAME_CASE(VFPROUND2)
34670 NODE_NAME_CASE(VFPROUND2_RND)
34671 NODE_NAME_CASE(STRICT_VFPROUND)
34672 NODE_NAME_CASE(VMFPROUND)
34673 NODE_NAME_CASE(VFPROUND_RND)
34674 NODE_NAME_CASE(VFPROUNDS)
34675 NODE_NAME_CASE(VFPROUNDS_RND)
34676 NODE_NAME_CASE(VSHLDQ)
34677 NODE_NAME_CASE(VSRLDQ)
34678 NODE_NAME_CASE(VSHL)
34679 NODE_NAME_CASE(VSRL)
34680 NODE_NAME_CASE(VSRA)
34681 NODE_NAME_CASE(VSHLI)
34682 NODE_NAME_CASE(VSRLI)
34683 NODE_NAME_CASE(VSRAI)
34684 NODE_NAME_CASE(VSHLV)
34685 NODE_NAME_CASE(VSRLV)
34686 NODE_NAME_CASE(VSRAV)
34687 NODE_NAME_CASE(VROTLI)
34688 NODE_NAME_CASE(VROTRI)
34689 NODE_NAME_CASE(VPPERM)
34690 NODE_NAME_CASE(CMPP)
34691 NODE_NAME_CASE(STRICT_CMPP)
34692 NODE_NAME_CASE(PCMPEQ)
34693 NODE_NAME_CASE(PCMPGT)
34694 NODE_NAME_CASE(PHMINPOS)
34695 NODE_NAME_CASE(ADD)
34696 NODE_NAME_CASE(SUB)
34697 NODE_NAME_CASE(ADC)
34698 NODE_NAME_CASE(SBB)
34699 NODE_NAME_CASE(SMUL)
34700 NODE_NAME_CASE(UMUL)
34701 NODE_NAME_CASE(OR)
34702 NODE_NAME_CASE(XOR)
34703 NODE_NAME_CASE(AND)
34704 NODE_NAME_CASE(BEXTR)
34705 NODE_NAME_CASE(BEXTRI)
34706 NODE_NAME_CASE(BZHI)
34707 NODE_NAME_CASE(PDEP)
34708 NODE_NAME_CASE(PEXT)
34709 NODE_NAME_CASE(MUL_IMM)
34710 NODE_NAME_CASE(MOVMSK)
34711 NODE_NAME_CASE(PTEST)
34712 NODE_NAME_CASE(TESTP)
34713 NODE_NAME_CASE(KORTEST)
34714 NODE_NAME_CASE(KTEST)
34715 NODE_NAME_CASE(KADD)
34716 NODE_NAME_CASE(KSHIFTL)
34717 NODE_NAME_CASE(KSHIFTR)
34718 NODE_NAME_CASE(PACKSS)
34719 NODE_NAME_CASE(PACKUS)
34720 NODE_NAME_CASE(PALIGNR)
34721 NODE_NAME_CASE(VALIGN)
34722 NODE_NAME_CASE(VSHLD)
34723 NODE_NAME_CASE(VSHRD)
34724 NODE_NAME_CASE(VSHLDV)
34725 NODE_NAME_CASE(VSHRDV)
34726 NODE_NAME_CASE(PSHUFD)
34727 NODE_NAME_CASE(PSHUFHW)
34728 NODE_NAME_CASE(PSHUFLW)
34729 NODE_NAME_CASE(SHUFP)
34730 NODE_NAME_CASE(SHUF128)
34731 NODE_NAME_CASE(MOVLHPS)
34732 NODE_NAME_CASE(MOVHLPS)
34733 NODE_NAME_CASE(MOVDDUP)
34734 NODE_NAME_CASE(MOVSHDUP)
34735 NODE_NAME_CASE(MOVSLDUP)
34736 NODE_NAME_CASE(MOVSD)
34737 NODE_NAME_CASE(MOVSS)
34738 NODE_NAME_CASE(MOVSH)
34739 NODE_NAME_CASE(UNPCKL)
34740 NODE_NAME_CASE(UNPCKH)
34741 NODE_NAME_CASE(VBROADCAST)
34742 NODE_NAME_CASE(VBROADCAST_LOAD)
34743 NODE_NAME_CASE(VBROADCASTM)
34744 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
34745 NODE_NAME_CASE(VPERMILPV)
34746 NODE_NAME_CASE(VPERMILPI)
34747 NODE_NAME_CASE(VPERM2X128)
34748 NODE_NAME_CASE(VPERMV)
34749 NODE_NAME_CASE(VPERMV3)
34750 NODE_NAME_CASE(VPERMI)
34751 NODE_NAME_CASE(VPTERNLOG)
34752 NODE_NAME_CASE(FP_TO_SINT_SAT)
34753 NODE_NAME_CASE(FP_TO_UINT_SAT)
34754 NODE_NAME_CASE(VFIXUPIMM)
34755 NODE_NAME_CASE(VFIXUPIMM_SAE)
34756 NODE_NAME_CASE(VFIXUPIMMS)
34757 NODE_NAME_CASE(VFIXUPIMMS_SAE)
34758 NODE_NAME_CASE(VRANGE)
34759 NODE_NAME_CASE(VRANGE_SAE)
34760 NODE_NAME_CASE(VRANGES)
34761 NODE_NAME_CASE(VRANGES_SAE)
34762 NODE_NAME_CASE(PMULUDQ)
34763 NODE_NAME_CASE(PMULDQ)
34764 NODE_NAME_CASE(PSADBW)
34765 NODE_NAME_CASE(DBPSADBW)
34766 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
34767 NODE_NAME_CASE(VAARG_64)
34768 NODE_NAME_CASE(VAARG_X32)
34769 NODE_NAME_CASE(DYN_ALLOCA)
34770 NODE_NAME_CASE(MFENCE)
34771 NODE_NAME_CASE(SEG_ALLOCA)
34772 NODE_NAME_CASE(PROBED_ALLOCA)
34773 NODE_NAME_CASE(RDRAND)
34774 NODE_NAME_CASE(RDSEED)
34775 NODE_NAME_CASE(RDPKRU)
34776 NODE_NAME_CASE(WRPKRU)
34777 NODE_NAME_CASE(VPMADDUBSW)
34778 NODE_NAME_CASE(VPMADDWD)
34779 NODE_NAME_CASE(VPSHA)
34780 NODE_NAME_CASE(VPSHL)
34781 NODE_NAME_CASE(VPCOM)
34782 NODE_NAME_CASE(VPCOMU)
34783 NODE_NAME_CASE(VPERMIL2)
34784 NODE_NAME_CASE(FMSUB)
34785 NODE_NAME_CASE(STRICT_FMSUB)
34786 NODE_NAME_CASE(FNMADD)
34787 NODE_NAME_CASE(STRICT_FNMADD)
34788 NODE_NAME_CASE(FNMSUB)
34789 NODE_NAME_CASE(STRICT_FNMSUB)
34790 NODE_NAME_CASE(FMADDSUB)
34791 NODE_NAME_CASE(FMSUBADD)
34792 NODE_NAME_CASE(FMADD_RND)
34793 NODE_NAME_CASE(FNMADD_RND)
34794 NODE_NAME_CASE(FMSUB_RND)
34795 NODE_NAME_CASE(FNMSUB_RND)
34796 NODE_NAME_CASE(FMADDSUB_RND)
34797 NODE_NAME_CASE(FMSUBADD_RND)
34798 NODE_NAME_CASE(VFMADDC)
34799 NODE_NAME_CASE(VFMADDC_RND)
34800 NODE_NAME_CASE(VFCMADDC)
34801 NODE_NAME_CASE(VFCMADDC_RND)
34802 NODE_NAME_CASE(VFMULC)
34803 NODE_NAME_CASE(VFMULC_RND)
34804 NODE_NAME_CASE(VFCMULC)
34805 NODE_NAME_CASE(VFCMULC_RND)
34806 NODE_NAME_CASE(VFMULCSH)
34807 NODE_NAME_CASE(VFMULCSH_RND)
34808 NODE_NAME_CASE(VFCMULCSH)
34809 NODE_NAME_CASE(VFCMULCSH_RND)
34810 NODE_NAME_CASE(VFMADDCSH)
34811 NODE_NAME_CASE(VFMADDCSH_RND)
34812 NODE_NAME_CASE(VFCMADDCSH)
34813 NODE_NAME_CASE(VFCMADDCSH_RND)
34814 NODE_NAME_CASE(VPMADD52H)
34815 NODE_NAME_CASE(VPMADD52L)
34816 NODE_NAME_CASE(VRNDSCALE)
34817 NODE_NAME_CASE(STRICT_VRNDSCALE)
34818 NODE_NAME_CASE(VRNDSCALE_SAE)
34819 NODE_NAME_CASE(VRNDSCALES)
34820 NODE_NAME_CASE(VRNDSCALES_SAE)
34821 NODE_NAME_CASE(VREDUCE)
34822 NODE_NAME_CASE(VREDUCE_SAE)
34823 NODE_NAME_CASE(VREDUCES)
34824 NODE_NAME_CASE(VREDUCES_SAE)
34825 NODE_NAME_CASE(VGETMANT)
34826 NODE_NAME_CASE(VGETMANT_SAE)
34827 NODE_NAME_CASE(VGETMANTS)
34828 NODE_NAME_CASE(VGETMANTS_SAE)
34829 NODE_NAME_CASE(PCMPESTR)
34830 NODE_NAME_CASE(PCMPISTR)
34831 NODE_NAME_CASE(XTEST)
34832 NODE_NAME_CASE(COMPRESS)
34833 NODE_NAME_CASE(EXPAND)
34834 NODE_NAME_CASE(SELECTS)
34835 NODE_NAME_CASE(ADDSUB)
34836 NODE_NAME_CASE(RCP14)
34837 NODE_NAME_CASE(RCP14S)
34838 NODE_NAME_CASE(RSQRT14)
34839 NODE_NAME_CASE(RSQRT14S)
34840 NODE_NAME_CASE(FADD_RND)
34841 NODE_NAME_CASE(FADDS)
34842 NODE_NAME_CASE(FADDS_RND)
34843 NODE_NAME_CASE(FSUB_RND)
34844 NODE_NAME_CASE(FSUBS)
34845 NODE_NAME_CASE(FSUBS_RND)
34846 NODE_NAME_CASE(FMUL_RND)
34847 NODE_NAME_CASE(FMULS)
34848 NODE_NAME_CASE(FMULS_RND)
34849 NODE_NAME_CASE(FDIV_RND)
34850 NODE_NAME_CASE(FDIVS)
34851 NODE_NAME_CASE(FDIVS_RND)
34852 NODE_NAME_CASE(FSQRT_RND)
34853 NODE_NAME_CASE(FSQRTS)
34854 NODE_NAME_CASE(FSQRTS_RND)
34855 NODE_NAME_CASE(FGETEXP)
34856 NODE_NAME_CASE(FGETEXP_SAE)
34857 NODE_NAME_CASE(FGETEXPS)
34858 NODE_NAME_CASE(FGETEXPS_SAE)
34859 NODE_NAME_CASE(SCALEF)
34860 NODE_NAME_CASE(SCALEF_RND)
34861 NODE_NAME_CASE(SCALEFS)
34862 NODE_NAME_CASE(SCALEFS_RND)
34863 NODE_NAME_CASE(MULHRS)
34864 NODE_NAME_CASE(SINT_TO_FP_RND)
34865 NODE_NAME_CASE(UINT_TO_FP_RND)
34866 NODE_NAME_CASE(CVTTP2SI)
34867 NODE_NAME_CASE(CVTTP2UI)
34868 NODE_NAME_CASE(STRICT_CVTTP2SI)
34869 NODE_NAME_CASE(STRICT_CVTTP2UI)
34870 NODE_NAME_CASE(MCVTTP2SI)
34871 NODE_NAME_CASE(MCVTTP2UI)
34872 NODE_NAME_CASE(CVTTP2SI_SAE)
34873 NODE_NAME_CASE(CVTTP2UI_SAE)
34874 NODE_NAME_CASE(CVTTS2SI)
34875 NODE_NAME_CASE(CVTTS2UI)
34876 NODE_NAME_CASE(CVTTS2SI_SAE)
34877 NODE_NAME_CASE(CVTTS2UI_SAE)
34878 NODE_NAME_CASE(CVTSI2P)
34879 NODE_NAME_CASE(CVTUI2P)
34880 NODE_NAME_CASE(STRICT_CVTSI2P)
34881 NODE_NAME_CASE(STRICT_CVTUI2P)
34882 NODE_NAME_CASE(MCVTSI2P)
34883 NODE_NAME_CASE(MCVTUI2P)
34884 NODE_NAME_CASE(VFPCLASS)
34885 NODE_NAME_CASE(VFPCLASSS)
34886 NODE_NAME_CASE(MULTISHIFT)
34887 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
34888 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
34889 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
34890 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
34891 NODE_NAME_CASE(CVTPS2PH)
34892 NODE_NAME_CASE(STRICT_CVTPS2PH)
34893 NODE_NAME_CASE(CVTPS2PH_SAE)
34894 NODE_NAME_CASE(MCVTPS2PH)
34895 NODE_NAME_CASE(MCVTPS2PH_SAE)
34896 NODE_NAME_CASE(CVTPH2PS)
34897 NODE_NAME_CASE(STRICT_CVTPH2PS)
34898 NODE_NAME_CASE(CVTPH2PS_SAE)
34899 NODE_NAME_CASE(CVTP2SI)
34900 NODE_NAME_CASE(CVTP2UI)
34901 NODE_NAME_CASE(MCVTP2SI)
34902 NODE_NAME_CASE(MCVTP2UI)
34903 NODE_NAME_CASE(CVTP2SI_RND)
34904 NODE_NAME_CASE(CVTP2UI_RND)
34905 NODE_NAME_CASE(CVTS2SI)
34906 NODE_NAME_CASE(CVTS2UI)
34907 NODE_NAME_CASE(CVTS2SI_RND)
34908 NODE_NAME_CASE(CVTS2UI_RND)
34909 NODE_NAME_CASE(CVTNEPS2BF16)
34910 NODE_NAME_CASE(MCVTNEPS2BF16)
34911 NODE_NAME_CASE(DPBF16PS)
34912 NODE_NAME_CASE(DPFP16PS)
34913 NODE_NAME_CASE(MPSADBW)
34914 NODE_NAME_CASE(LWPINS)
34915 NODE_NAME_CASE(MGATHER)
34916 NODE_NAME_CASE(MSCATTER)
34917 NODE_NAME_CASE(VPDPBUSD)
34918 NODE_NAME_CASE(VPDPBUSDS)
34919 NODE_NAME_CASE(VPDPWSSD)
34920 NODE_NAME_CASE(VPDPWSSDS)
34921 NODE_NAME_CASE(VPSHUFBITQMB)
34922 NODE_NAME_CASE(GF2P8MULB)
34923 NODE_NAME_CASE(GF2P8AFFINEQB)
34924 NODE_NAME_CASE(GF2P8AFFINEINVQB)
34925 NODE_NAME_CASE(NT_CALL)
34926 NODE_NAME_CASE(NT_BRIND)
34927 NODE_NAME_CASE(UMWAIT)
34928 NODE_NAME_CASE(TPAUSE)
34929 NODE_NAME_CASE(ENQCMD)
34930 NODE_NAME_CASE(ENQCMDS)
34931 NODE_NAME_CASE(VP2INTERSECT)
34932 NODE_NAME_CASE(VPDPBSUD)
34933 NODE_NAME_CASE(VPDPBSUDS)
34934 NODE_NAME_CASE(VPDPBUUD)
34935 NODE_NAME_CASE(VPDPBUUDS)
34936 NODE_NAME_CASE(VPDPBSSD)
34937 NODE_NAME_CASE(VPDPBSSDS)
34938 NODE_NAME_CASE(VPDPWSUD)
34939 NODE_NAME_CASE(VPDPWSUDS)
34940 NODE_NAME_CASE(VPDPWUSD)
34941 NODE_NAME_CASE(VPDPWUSDS)
34942 NODE_NAME_CASE(VPDPWUUD)
34943 NODE_NAME_CASE(VPDPWUUDS)
34944 NODE_NAME_CASE(VMINMAX)
34945 NODE_NAME_CASE(VMINMAX_SAE)
34946 NODE_NAME_CASE(VMINMAXS)
34947 NODE_NAME_CASE(VMINMAXS_SAE)
34948 NODE_NAME_CASE(CVTP2IBS)
34949 NODE_NAME_CASE(CVTP2IUBS)
34950 NODE_NAME_CASE(CVTP2IBS_RND)
34951 NODE_NAME_CASE(CVTP2IUBS_RND)
34952 NODE_NAME_CASE(CVTTP2IBS)
34953 NODE_NAME_CASE(CVTTP2IUBS)
34954 NODE_NAME_CASE(CVTTP2IBS_SAE)
34955 NODE_NAME_CASE(CVTTP2IUBS_SAE)
34956 NODE_NAME_CASE(VCVT2PH2BF8)
34957 NODE_NAME_CASE(VCVT2PH2BF8S)
34958 NODE_NAME_CASE(VCVT2PH2HF8)
34959 NODE_NAME_CASE(VCVT2PH2HF8S)
34960 NODE_NAME_CASE(VCVTBIASPH2BF8)
34961 NODE_NAME_CASE(VCVTBIASPH2BF8S)
34962 NODE_NAME_CASE(VCVTBIASPH2HF8)
34963 NODE_NAME_CASE(VCVTBIASPH2HF8S)
34964 NODE_NAME_CASE(VCVTPH2BF8)
34965 NODE_NAME_CASE(VCVTPH2BF8S)
34966 NODE_NAME_CASE(VCVTPH2HF8)
34967 NODE_NAME_CASE(VCVTPH2HF8S)
34968 NODE_NAME_CASE(VMCVTBIASPH2BF8)
34969 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
34970 NODE_NAME_CASE(VMCVTBIASPH2HF8)
34971 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
34972 NODE_NAME_CASE(VMCVTPH2BF8)
34973 NODE_NAME_CASE(VMCVTPH2BF8S)
34974 NODE_NAME_CASE(VMCVTPH2HF8)
34975 NODE_NAME_CASE(VMCVTPH2HF8S)
34976 NODE_NAME_CASE(VCVTHF82PH)
34977 NODE_NAME_CASE(AESENC128KL)
34978 NODE_NAME_CASE(AESDEC128KL)
34979 NODE_NAME_CASE(AESENC256KL)
34980 NODE_NAME_CASE(AESDEC256KL)
34981 NODE_NAME_CASE(AESENCWIDE128KL)
34982 NODE_NAME_CASE(AESDECWIDE128KL)
34983 NODE_NAME_CASE(AESENCWIDE256KL)
34984 NODE_NAME_CASE(AESDECWIDE256KL)
34985 NODE_NAME_CASE(CMPCCXADD)
34986 NODE_NAME_CASE(TESTUI)
34987 NODE_NAME_CASE(FP80_ADD)
34988 NODE_NAME_CASE(STRICT_FP80_ADD)
34989 NODE_NAME_CASE(CCMP)
34990 NODE_NAME_CASE(CTEST)
34991 NODE_NAME_CASE(CLOAD)
34992 NODE_NAME_CASE(CSTORE)
34993 NODE_NAME_CASE(CVTTS2SIS)
34994 NODE_NAME_CASE(CVTTS2UIS)
34995 NODE_NAME_CASE(CVTTS2SIS_SAE)
34996 NODE_NAME_CASE(CVTTS2UIS_SAE)
34997 NODE_NAME_CASE(CVTTP2SIS)
34998 NODE_NAME_CASE(MCVTTP2SIS)
34999 NODE_NAME_CASE(CVTTP2UIS_SAE)
35000 NODE_NAME_CASE(CVTTP2SIS_SAE)
35001 NODE_NAME_CASE(CVTTP2UIS)
35002 NODE_NAME_CASE(MCVTTP2UIS)
35003 }
35004 return nullptr;
35005#undef NODE_NAME_CASE
35006}
35007
35008/// Return true if the addressing mode represented by AM is legal for this
35009/// target, for a load/store of the specified type.
35010bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35011 const AddrMode &AM, Type *Ty,
35012 unsigned AS,
35013 Instruction *I) const {
35014 // X86 supports extremely general addressing modes.
35015 CodeModel::Model M = getTargetMachine().getCodeModel();
35016
35017 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35018 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35019 return false;
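 // e.g. a displacement such as 0x12345678 or -0x40000000 encodes directly as a
 // sign-extended imm32; anything outside +/-2GiB must first be materialized in a
 // register.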
35020
35021 if (AM.BaseGV) {
35022 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35023
35024 // If a reference to this global requires an extra load, we can't fold it.
35025 if (isGlobalStubReference(GVFlags))
35026 return false;
35027
35028 // If BaseGV requires a register for the PIC base, we cannot also have a
35029 // BaseReg specified.
35030 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35031 return false;
35032
35033 // If lower 4G is not available, then we must use rip-relative addressing.
35034 if ((M != CodeModel::Small || isPositionIndependent()) &&
35035 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35036 return false;
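 // Under these code models the global must be addressed as [rip + disp32], which
 // leaves no room for an additional scaled index or extra byte offset.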
35037 }
35038
35039 switch (AM.Scale) {
35040 case 0:
35041 case 1:
35042 case 2:
35043 case 4:
35044 case 8:
35045 // These scales always work.
35046 break;
35047 case 3:
35048 case 5:
35049 case 9:
35050 // These scales are formed with basereg+scalereg. Only accept if there is
35051 // no basereg yet.
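 // e.g. a scale of 3 is really base + index*2 with the same register in both
 // slots (lea (%rdi,%rdi,2), %rax), so the base register position must be free.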
35052 if (AM.HasBaseReg)
35053 return false;
35054 break;
35055 default: // Other stuff never works.
35056 return false;
35057 }
35058
35059 return true;
35060}
35061
35062bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35063 switch (Opcode) {
35064 // These are non-commutative binops.
35065 // TODO: Add more X86ISD opcodes once we have test coverage.
35066 case X86ISD::ANDNP:
35067 case X86ISD::PCMPGT:
35068 case X86ISD::FMAX:
35069 case X86ISD::FMIN:
35070 case X86ISD::FANDN:
35071 case X86ISD::VPSHA:
35072 case X86ISD::VPSHL:
35073 case X86ISD::VSHLV:
35074 case X86ISD::VSRLV:
35075 case X86ISD::VSRAV:
35076 return true;
35077 }
35078
35079 return TargetLoweringBase::isBinOp(Opcode);
35080}
35081
35082bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35083 switch (Opcode) {
35084 // TODO: Add more X86ISD opcodes once we have test coverage.
35085 case X86ISD::PCMPEQ:
35086 case X86ISD::PMULDQ:
35087 case X86ISD::PMULUDQ:
35088 case X86ISD::FMAXC:
35089 case X86ISD::FMINC:
35090 case X86ISD::FAND:
35091 case X86ISD::FOR:
35092 case X86ISD::FXOR:
35093 return true;
35094 }
35095
35096 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35097}
35098
35099bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35100 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35101 return false;
35102 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35103 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35104 return NumBits1 > NumBits2;
35105}
35106
35107bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35108 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35109 return false;
35110
35111 if (!isTypeLegal(EVT::getEVT(Ty1)))
35112 return false;
35113
35114 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35115
35116 // Assuming the caller doesn't have a zeroext or signext return parameter,
35117 // truncation all the way down to i1 is valid.
35118 return true;
35119}
35120
35122 return isInt<32>(Imm);
35123}
35124
35126 // Can also use sub to handle negated immediates.
35127 return isInt<32>(Imm);
35128}
35129
35131 return isInt<32>(Imm);
35132}
35133
35134bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35135 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35136 return false;
35137 unsigned NumBits1 = VT1.getSizeInBits();
35138 unsigned NumBits2 = VT2.getSizeInBits();
35139 return NumBits1 > NumBits2;
35140}
35141
35142bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35143 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35144 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35145}
35146
35147bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35148 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35149 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35150}
35151
35152bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35153 EVT VT1 = Val.getValueType();
35154 if (isZExtFree(VT1, VT2))
35155 return true;
35156
35157 if (Val.getOpcode() != ISD::LOAD)
35158 return false;
35159
35160 if (!VT1.isSimple() || !VT1.isInteger() ||
35161 !VT2.isSimple() || !VT2.isInteger())
35162 return false;
35163
35164 switch (VT1.getSimpleVT().SimpleTy) {
35165 default: break;
35166 case MVT::i8:
35167 case MVT::i16:
35168 case MVT::i32:
35169 // X86 has 8, 16, and 32-bit zero-extending loads.
35170 return true;
35171 }
35172
35173 return false;
35174}
35175
35177 if (!Subtarget.is64Bit())
35178 return false;
35180}
35181
35182bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35183 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35184 return false;
35185
35186 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35187
35188 // There is no extending load for vXi1.
35189 if (SrcVT.getScalarType() == MVT::i1)
35190 return false;
35191
35192 return true;
35193}
35194
35195bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35196 EVT VT) const {
35197 if (Subtarget.useSoftFloat())
35198 return false;
35199
35200 if (!Subtarget.hasAnyFMA())
35201 return false;
35202
35203 VT = VT.getScalarType();
35204
35205 if (!VT.isSimple())
35206 return false;
35207
35208 switch (VT.getSimpleVT().SimpleTy) {
35209 case MVT::f16:
35210 return Subtarget.hasFP16();
35211 case MVT::f32:
35212 case MVT::f64:
35213 return true;
35214 default:
35215 break;
35216 }
35217
35218 return false;
35219}
35220
35221bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35222 EVT DestVT) const {
35223 // i16 instructions are longer (0x66 prefix) and potentially slower.
35224 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35225}
35226
35228 EVT VT) const {
35229 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35230 // benefit. The transform may also be profitable for scalar code.
35231 if (!Subtarget.hasAVX512())
35232 return false;
35233 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35234 return false;
35235 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35236 return false;
35237
35238 return true;
35239}
35240
35241/// Targets can use this to indicate that they only support *some*
35242/// VECTOR_SHUFFLE operations, those with specific masks.
35243/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35244/// are assumed to be legal.
35245bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35246 if (!VT.isSimple())
35247 return false;
35248
35249 // Not for i1 vectors
35250 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35251 return false;
35252
35253 // Very little shuffling can be done for 64-bit vectors right now.
35254 if (VT.getSimpleVT().getSizeInBits() == 64)
35255 return false;
35256
35257 // We only care that the types being shuffled are legal. The lowering can
35258 // handle any possible shuffle mask that results.
35259 return isTypeLegal(VT.getSimpleVT());
35260}
35261
35262bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35263 EVT VT) const {
35264 // Don't convert an 'and' into a shuffle that we don't directly support.
35265 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35266 if (!Subtarget.hasAVX2())
35267 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35268 return false;
35269
35270 // Just delegate to the generic legality, clear masks aren't special.
35271 return isShuffleMaskLegal(Mask, VT);
35272}
35273
35274bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35275 // If the subtarget is using thunks, we need to not generate jump tables.
35276 if (Subtarget.useIndirectThunkBranches())
35277 return false;
35278
35279 // Otherwise, fallback on the generic logic.
35280 return TargetLowering::areJTsAllowed(Fn);
35281}
35282
35283MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35284 EVT ConditionVT) const {
35285 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35286 // zero-extensions.
35287 if (ConditionVT.getSizeInBits() < 32)
35288 return MVT::i32;
35289 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35290 ConditionVT);
35291}
35292
35293//===----------------------------------------------------------------------===//
35294// X86 Scheduler Hooks
35295//===----------------------------------------------------------------------===//
35296
35297// Returns true if EFLAG is consumed after this iterator in the rest of the
35298// basic block or any successors of the basic block.
35299static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
35300 MachineBasicBlock *BB) {
35301 // Scan forward through BB for a use/def of EFLAGS.
35302 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35303 if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
35304 return true;
35305 // If we found a def, we can stop searching.
35306 if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
35307 return false;
35308 }
35309
35310 // If we hit the end of the block, check whether EFLAGS is live into a
35311 // successor.
35312 for (MachineBasicBlock *Succ : BB->successors())
35313 if (Succ->isLiveIn(X86::EFLAGS))
35314 return true;
35315
35316 return false;
35317}
35318
35319/// Utility function to emit xbegin specifying the start of an RTM region.
35320static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35321 const TargetInstrInfo *TII) {
35322 const MIMetadata MIMD(MI);
35323
35324 const BasicBlock *BB = MBB->getBasicBlock();
35325 MachineFunction::iterator I = ++MBB->getIterator();
35326
35327 // For the v = xbegin(), we generate
35328 //
35329 // thisMBB:
35330 // xbegin sinkMBB
35331 //
35332 // mainMBB:
35333 // s0 = -1
35334 //
35335 // fallBB:
35336 // eax = # XABORT_DEF
35337 // s1 = eax
35338 //
35339 // sinkMBB:
35340 // v = phi(s0/mainBB, s1/fallBB)
35341
35342 MachineBasicBlock *thisMBB = MBB;
35343 MachineFunction *MF = MBB->getParent();
35344 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35345 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35346 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35347 MF->insert(I, mainMBB);
35348 MF->insert(I, fallMBB);
35349 MF->insert(I, sinkMBB);
35350
35351 if (isEFLAGSLiveAfter(MI, MBB)) {
35352 mainMBB->addLiveIn(X86::EFLAGS);
35353 fallMBB->addLiveIn(X86::EFLAGS);
35354 sinkMBB->addLiveIn(X86::EFLAGS);
35355 }
35356
35357 // Transfer the remainder of BB and its successor edges to sinkMBB.
35358 sinkMBB->splice(sinkMBB->begin(), MBB,
35359 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35360 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35361
35362 MachineRegisterInfo &MRI = MF->getRegInfo();
35363 Register DstReg = MI.getOperand(0).getReg();
35364 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35365 Register mainDstReg = MRI.createVirtualRegister(RC);
35366 Register fallDstReg = MRI.createVirtualRegister(RC);
35367
35368 // thisMBB:
35369 // xbegin fallMBB
35370 // # fallthrough to mainMBB
35371 // # abort branches to fallMBB
35372 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35373 thisMBB->addSuccessor(mainMBB);
35374 thisMBB->addSuccessor(fallMBB);
35375
35376 // mainMBB:
35377 // mainDstReg := -1
35378 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
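 // XBEGIN leaves ~0 (-1) in the result when the transaction successfully starts;
 // an abort instead resumes at fallMBB with the abort status in EAX.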
35379 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35380 mainMBB->addSuccessor(sinkMBB);
35381
35382 // fallMBB:
35383 // ; pseudo instruction to model hardware's definition from XABORT
35384 // EAX := XABORT_DEF
35385 // fallDstReg := EAX
35386 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35387 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35388 .addReg(X86::EAX);
35389 fallMBB->addSuccessor(sinkMBB);
35390
35391 // sinkMBB:
35392 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35393 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35394 .addReg(mainDstReg).addMBB(mainMBB)
35395 .addReg(fallDstReg).addMBB(fallMBB);
35396
35397 MI.eraseFromParent();
35398 return sinkMBB;
35399}
35400
35401MachineBasicBlock *
35402X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35403 MachineBasicBlock *MBB) const {
35404 // Emit va_arg instruction on X86-64.
35405
35406 // Operands to this pseudo-instruction:
35407 // 0 ) Output : destination address (reg)
35408 // 1-5) Input : va_list address (addr, i64mem)
35409 // 6 ) ArgSize : Size (in bytes) of vararg type
35410 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35411 // 8 ) Align : Alignment of type
35412 // 9 ) EFLAGS (implicit-def)
35413
35414 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35415 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35416
35417 Register DestReg = MI.getOperand(0).getReg();
35418 MachineOperand &Base = MI.getOperand(1);
35419 MachineOperand &Scale = MI.getOperand(2);
35420 MachineOperand &Index = MI.getOperand(3);
35421 MachineOperand &Disp = MI.getOperand(4);
35422 MachineOperand &Segment = MI.getOperand(5);
35423 unsigned ArgSize = MI.getOperand(6).getImm();
35424 unsigned ArgMode = MI.getOperand(7).getImm();
35425 Align Alignment = Align(MI.getOperand(8).getImm());
35426
35427 MachineFunction *MF = MBB->getParent();
35428
35429 // Memory Reference
35430 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35431
35432 MachineMemOperand *OldMMO = MI.memoperands().front();
35433
35434 // Clone the MMO into two separate MMOs for loading and storing
35435 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35436 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35437 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35438 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35439
35440 // Machine Information
35441 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35442 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35443 const TargetRegisterClass *AddrRegClass =
35444 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35445 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35446 const MIMetadata MIMD(MI);
35447
35448 // struct va_list {
35449 // i32 gp_offset
35450 // i32 fp_offset
35451 // i64 overflow_area (address)
35452 // i64 reg_save_area (address)
35453 // }
35454 // sizeof(va_list) = 24
35455 // alignment(va_list) = 8
35456
35457 unsigned TotalNumIntRegs = 6;
35458 unsigned TotalNumXMMRegs = 8;
35459 bool UseGPOffset = (ArgMode == 1);
35460 bool UseFPOffset = (ArgMode == 2);
35461 unsigned MaxOffset = TotalNumIntRegs * 8 +
35462 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
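 // reg_save_area layout: 6 GP registers * 8 bytes (offsets 0..48) followed by
 // 8 XMM registers * 16 bytes, so fp_offset can legitimately reach 48 + 128 = 176.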
35463
35464 /* Align ArgSize to a multiple of 8 */
35465 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
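 // e.g. an ArgSize of 12 rounds up to 16, keeping the overflow area 8-byte aligned.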
35466 bool NeedsAlign = (Alignment > 8);
35467
35468 MachineBasicBlock *thisMBB = MBB;
35469 MachineBasicBlock *overflowMBB;
35470 MachineBasicBlock *offsetMBB;
35471 MachineBasicBlock *endMBB;
35472
35473 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
35474 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
35475 unsigned OffsetReg = 0;
35476
35477 if (!UseGPOffset && !UseFPOffset) {
35478 // If we only pull from the overflow region, we don't create a branch.
35479 // We don't need to alter control flow.
35480 OffsetDestReg = 0; // unused
35481 OverflowDestReg = DestReg;
35482
35483 offsetMBB = nullptr;
35484 overflowMBB = thisMBB;
35485 endMBB = thisMBB;
35486 } else {
35487 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35488 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35489 // If not, pull from overflow_area. (branch to overflowMBB)
35490 //
35491 // thisMBB
35492 // | .
35493 // | .
35494 // offsetMBB overflowMBB
35495 // | .
35496 // | .
35497 // endMBB
35498
35499 // Registers for the PHI in endMBB
35500 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35501 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35502
35503 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35504 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35505 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35506 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35507
35508 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35509
35510 // Insert the new basic blocks
35511 MF->insert(MBBIter, offsetMBB);
35512 MF->insert(MBBIter, overflowMBB);
35513 MF->insert(MBBIter, endMBB);
35514
35515 // Transfer the remainder of MBB and its successor edges to endMBB.
35516 endMBB->splice(endMBB->begin(), thisMBB,
35517 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35518 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35519
35520 // Make offsetMBB and overflowMBB successors of thisMBB
35521 thisMBB->addSuccessor(offsetMBB);
35522 thisMBB->addSuccessor(overflowMBB);
35523
35524 // endMBB is a successor of both offsetMBB and overflowMBB
35525 offsetMBB->addSuccessor(endMBB);
35526 overflowMBB->addSuccessor(endMBB);
35527
35528 // Load the offset value into a register
35529 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35530 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35531 .add(Base)
35532 .add(Scale)
35533 .add(Index)
35534 .addDisp(Disp, UseFPOffset ? 4 : 0)
35535 .add(Segment)
35536 .setMemRefs(LoadOnlyMMO);
35537
35538 // Check if there is enough room left to pull this argument.
35539 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35540 .addReg(OffsetReg)
35541 .addImm(MaxOffset + 8 - ArgSizeA8);
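 // The argument fits in the reg_save_area only if offset + ArgSizeA8 stays within
 // MaxOffset; offsets advance in 8-byte steps, so comparing against
 // MaxOffset + 8 - ArgSizeA8 and branching on unsigned >= (COND_AE) sends exactly
 // the non-fitting cases to the overflow path.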
35542
35543 // Branch to "overflowMBB" if offset >= max
35544 // Fall through to "offsetMBB" otherwise
35545 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35546 .addMBB(overflowMBB).addImm(X86::COND_AE);
35547 }
35548
35549 // In offsetMBB, emit code to use the reg_save_area.
35550 if (offsetMBB) {
35551 assert(OffsetReg != 0);
35552
35553 // Read the reg_save_area address.
35554 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35555 BuildMI(
35556 offsetMBB, MIMD,
35557 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35558 RegSaveReg)
35559 .add(Base)
35560 .add(Scale)
35561 .add(Index)
35562 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35563 .add(Segment)
35564 .setMemRefs(LoadOnlyMMO);
35565
35566 if (Subtarget.isTarget64BitLP64()) {
35567 // Zero-extend the offset
35568 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35569 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35570 .addImm(0)
35571 .addReg(OffsetReg)
35572 .addImm(X86::sub_32bit);
35573
35574 // Add the offset to the reg_save_area to get the final address.
35575 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35576 .addReg(OffsetReg64)
35577 .addReg(RegSaveReg);
35578 } else {
35579 // Add the offset to the reg_save_area to get the final address.
35580 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35581 .addReg(OffsetReg)
35582 .addReg(RegSaveReg);
35583 }
35584
35585 // Compute the offset for the next argument
35586 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35587 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
35588 .addReg(OffsetReg)
35589 .addImm(UseFPOffset ? 16 : 8);
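 // GP registers take 8-byte slots and XMM registers 16-byte slots in the
 // reg_save_area, so gp_offset/fp_offset advance by that amount per argument.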
35590
35591 // Store it back into the va_list.
35592 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
35593 .add(Base)
35594 .add(Scale)
35595 .add(Index)
35596 .addDisp(Disp, UseFPOffset ? 4 : 0)
35597 .add(Segment)
35598 .addReg(NextOffsetReg)
35599 .setMemRefs(StoreOnlyMMO);
35600
35601 // Jump to endMBB
35602 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
35603 .addMBB(endMBB);
35604 }
35605
35606 //
35607 // Emit code to use overflow area
35608 //
35609
35610 // Load the overflow_area address into a register.
35611 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
35612 BuildMI(overflowMBB, MIMD,
35613 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35614 OverflowAddrReg)
35615 .add(Base)
35616 .add(Scale)
35617 .add(Index)
35618 .addDisp(Disp, 8)
35619 .add(Segment)
35620 .setMemRefs(LoadOnlyMMO);
35621
35622 // If we need to align it, do so. Otherwise, just copy the address
35623 // to OverflowDestReg.
35624 if (NeedsAlign) {
35625 // Align the overflow address
35626 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
35627
35628 // aligned_addr = (addr + (align-1)) & ~(align-1)
35629 BuildMI(
35630 overflowMBB, MIMD,
35631 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35632 TmpReg)
35633 .addReg(OverflowAddrReg)
35634 .addImm(Alignment.value() - 1);
35635
35636 BuildMI(
35637 overflowMBB, MIMD,
35638 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
35639 OverflowDestReg)
35640 .addReg(TmpReg)
35641 .addImm(~(uint64_t)(Alignment.value() - 1));
35642 } else {
35643 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
35644 .addReg(OverflowAddrReg);
35645 }
35646
35647 // Compute the next overflow address after this argument.
35648 // (the overflow address should be kept 8-byte aligned)
35649 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
35650 BuildMI(
35651 overflowMBB, MIMD,
35652 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35653 NextAddrReg)
35654 .addReg(OverflowDestReg)
35655 .addImm(ArgSizeA8);
35656
35657 // Store the new overflow address.
35658 BuildMI(overflowMBB, MIMD,
35659 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
35660 .add(Base)
35661 .add(Scale)
35662 .add(Index)
35663 .addDisp(Disp, 8)
35664 .add(Segment)
35665 .addReg(NextAddrReg)
35666 .setMemRefs(StoreOnlyMMO);
35667
35668 // If we branched, emit the PHI to the front of endMBB.
35669 if (offsetMBB) {
35670 BuildMI(*endMBB, endMBB->begin(), MIMD,
35671 TII->get(X86::PHI), DestReg)
35672 .addReg(OffsetDestReg).addMBB(offsetMBB)
35673 .addReg(OverflowDestReg).addMBB(overflowMBB);
35674 }
35675
35676 // Erase the pseudo instruction
35677 MI.eraseFromParent();
35678
35679 return endMBB;
35680}
35681
35682// The EFLAGS operand of SelectItr might be missing a kill marker
35683// because there were multiple uses of EFLAGS, and ISel didn't know
35684// which to mark. Figure out whether SelectItr should have had a
35685// kill marker, and set it if it should. Returns the correct kill
35686// marker value.
35687static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
35688 MachineBasicBlock* BB,
35689 const TargetRegisterInfo* TRI) {
35690 if (isEFLAGSLiveAfter(SelectItr, BB))
35691 return false;
35692
35693 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
35694 // out. SelectMI should have a kill flag on EFLAGS.
35695 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
35696 return true;
35697}
35698
35699// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
35700// together with other CMOV pseudo-opcodes into a single basic-block with
35701// conditional jump around it.
35702static bool isCMOVPseudo(MachineInstr &MI) {
35703 switch (MI.getOpcode()) {
35704 case X86::CMOV_FR16:
35705 case X86::CMOV_FR16X:
35706 case X86::CMOV_FR32:
35707 case X86::CMOV_FR32X:
35708 case X86::CMOV_FR64:
35709 case X86::CMOV_FR64X:
35710 case X86::CMOV_GR8:
35711 case X86::CMOV_GR16:
35712 case X86::CMOV_GR32:
35713 case X86::CMOV_RFP32:
35714 case X86::CMOV_RFP64:
35715 case X86::CMOV_RFP80:
35716 case X86::CMOV_VR64:
35717 case X86::CMOV_VR128:
35718 case X86::CMOV_VR128X:
35719 case X86::CMOV_VR256:
35720 case X86::CMOV_VR256X:
35721 case X86::CMOV_VR512:
35722 case X86::CMOV_VK1:
35723 case X86::CMOV_VK2:
35724 case X86::CMOV_VK4:
35725 case X86::CMOV_VK8:
35726 case X86::CMOV_VK16:
35727 case X86::CMOV_VK32:
35728 case X86::CMOV_VK64:
35729 return true;
35730
35731 default:
35732 return false;
35733 }
35734}
35735
35736// Helper function, which inserts PHI functions into SinkMBB:
35737// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
35738// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
35739// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
35740// the last PHI function inserted.
35741static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
35742 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
35743 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
35744 MachineBasicBlock *SinkMBB) {
35745 MachineFunction *MF = TrueMBB->getParent();
35746 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
35747 const MIMetadata MIMD(*MIItBegin);
35748
35749 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
35750 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35751
35752 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
35753
35754 // As we are creating the PHIs, we have to be careful if there is more than
35755 // one. Later CMOVs may reference the results of earlier CMOVs, but later
35756 // PHIs have to reference the individual true/false inputs from earlier PHIs.
35757 // That also means that PHI construction must work forward from earlier to
35758 // later, and that the code must maintain a mapping from earlier PHI's
35759 // destination registers, and the registers that went into the PHI.
35760 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
35761 MachineInstrBuilder MIB;
35762
35763 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
35764 Register DestReg = MIIt->getOperand(0).getReg();
35765 Register Op1Reg = MIIt->getOperand(1).getReg();
35766 Register Op2Reg = MIIt->getOperand(2).getReg();
35767
35768 // If this CMOV we are generating is the opposite condition from
35769 // the jump we generated, then we have to swap the operands for the
35770 // PHI that is going to be generated.
35771 if (MIIt->getOperand(3).getImm() == OppCC)
35772 std::swap(Op1Reg, Op2Reg);
35773
35774 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
35775 Op1Reg = It->second.first;
35776
35777 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
35778 Op2Reg = It->second.second;
35779
35780 MIB =
35781 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
35782 .addReg(Op1Reg)
35783 .addMBB(FalseMBB)
35784 .addReg(Op2Reg)
35785 .addMBB(TrueMBB);
35786
35787 // Add this PHI to the rewrite table.
35788 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
35789 }
35790
35791 return MIB;
35792}
35793
35794// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
35795MachineBasicBlock *
35796X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
35797 MachineInstr &SecondCascadedCMOV,
35798 MachineBasicBlock *ThisMBB) const {
35799 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35800 const MIMetadata MIMD(FirstCMOV);
35801
35802 // We lower cascaded CMOVs such as
35803 //
35804 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
35805 //
35806 // to two successive branches.
35807 //
35808 // Without this, we would add a PHI between the two jumps, which ends up
35809 // creating a few copies all around. For instance, for
35810 //
35811 // (sitofp (zext (fcmp une)))
35812 //
35813 // we would generate:
35814 //
35815 // ucomiss %xmm1, %xmm0
35816 // movss <1.0f>, %xmm0
35817 // movaps %xmm0, %xmm1
35818 // jne .LBB5_2
35819 // xorps %xmm1, %xmm1
35820 // .LBB5_2:
35821 // jp .LBB5_4
35822 // movaps %xmm1, %xmm0
35823 // .LBB5_4:
35824 // retq
35825 //
35826 // because this custom-inserter would have generated:
35827 //
35828 // A
35829 // | \
35830 // | B
35831 // | /
35832 // C
35833 // | \
35834 // | D
35835 // | /
35836 // E
35837 //
35838 // A: X = ...; Y = ...
35839 // B: empty
35840 // C: Z = PHI [X, A], [Y, B]
35841 // D: empty
35842 // E: PHI [X, C], [Z, D]
35843 //
35844 // If we lower both CMOVs in a single step, we can instead generate:
35845 //
35846 // A
35847 // | \
35848 // | C
35849 // | /|
35850 // |/ |
35851 // | |
35852 // | D
35853 // | /
35854 // E
35855 //
35856 // A: X = ...; Y = ...
35857 // D: empty
35858 // E: PHI [X, A], [X, C], [Y, D]
35859 //
35860 // Which, in our sitofp/fcmp example, gives us something like:
35861 //
35862 // ucomiss %xmm1, %xmm0
35863 // movss <1.0f>, %xmm0
35864 // jne .LBB5_4
35865 // jp .LBB5_4
35866 // xorps %xmm0, %xmm0
35867 // .LBB5_4:
35868 // retq
35869 //
35870
35871 // We lower cascaded CMOV into two successive branches to the same block.
35872 // EFLAGS is used by both, so mark it as live in the second.
35873 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35874 MachineFunction *F = ThisMBB->getParent();
35875 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35876 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35877 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35878
35879 MachineFunction::iterator It = ++ThisMBB->getIterator();
35880 F->insert(It, FirstInsertedMBB);
35881 F->insert(It, SecondInsertedMBB);
35882 F->insert(It, SinkMBB);
35883
35884 // For a cascaded CMOV, we lower it to two successive branches to
35885 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
35886 // the FirstInsertedMBB.
35887 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35888
35889 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35890 // live into the sink and copy blocks.
35891 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35892 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
35893 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
35894 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35895 SinkMBB->addLiveIn(X86::EFLAGS);
35896 }
35897
35898 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35899 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35900 std::next(MachineBasicBlock::iterator(FirstCMOV)),
35901 ThisMBB->end());
35902 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35903
35904 // Fallthrough block for ThisMBB.
35905 ThisMBB->addSuccessor(FirstInsertedMBB);
35906 // The true block target of the first branch is always SinkMBB.
35907 ThisMBB->addSuccessor(SinkMBB);
35908 // Fallthrough block for FirstInsertedMBB.
35909 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35910 // The true block for the branch of FirstInsertedMBB.
35911 FirstInsertedMBB->addSuccessor(SinkMBB);
35912 // This is fallthrough.
35913 SecondInsertedMBB->addSuccessor(SinkMBB);
35914
35915 // Create the conditional branch instructions.
35916 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
35917 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35918
35919 X86::CondCode SecondCC =
35920 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
35921 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
35922 .addMBB(SinkMBB)
35923 .addImm(SecondCC);
35924
35925 // SinkMBB:
35926 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
35927 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
35928 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
35929 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
35930 MachineInstrBuilder MIB =
35931 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
35932 .addReg(Op1Reg)
35933 .addMBB(SecondInsertedMBB)
35934 .addReg(Op2Reg)
35935 .addMBB(ThisMBB);
35936
35937 // SecondInsertedMBB provides the same incoming value as the
35938 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
35939 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
35940
35941 // Now remove the CMOVs.
35942 FirstCMOV.eraseFromParent();
35943 SecondCascadedCMOV.eraseFromParent();
35944
35945 return SinkMBB;
35946}
35947
35948MachineBasicBlock *
35949X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
35950 MachineBasicBlock *ThisMBB) const {
35951 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35952 const MIMetadata MIMD(MI);
35953
35954 // To "insert" a SELECT_CC instruction, we actually have to insert the
35955 // diamond control-flow pattern. The incoming instruction knows the
35956 // destination vreg to set, the condition code register to branch on, the
35957 // true/false values to select between and a branch opcode to use.
35958
35959 // ThisMBB:
35960 // ...
35961 // TrueVal = ...
35962 // cmpTY ccX, r1, r2
35963 // bCC copy1MBB
35964 // fallthrough --> FalseMBB
35965
35966 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
35967 // as described above, by inserting a BB, and then making a PHI at the join
35968 // point to select the true and false operands of the CMOV in the PHI.
35969 //
35970 // The code also handles two different cases of multiple CMOV opcodes
35971 // in a row.
35972 //
35973 // Case 1:
35974 // In this case, there are multiple CMOVs in a row, all which are based on
35975 // the same condition setting (or the exact opposite condition setting).
35976 // In this case we can lower all the CMOVs using a single inserted BB, and
35977 // then make a number of PHIs at the join point to model the CMOVs. The only
35978 // trickiness here, is that in a case like:
35979 //
35980 // t2 = CMOV cond1 t1, f1
35981 // t3 = CMOV cond1 t2, f2
35982 //
35983 // when rewriting this into PHIs, we have to perform some renaming on the
35984 // temps since you cannot have a PHI operand refer to a PHI result earlier
35985 // in the same block. The "simple" but wrong lowering would be:
35986 //
35987 // t2 = PHI t1(BB1), f1(BB2)
35988 // t3 = PHI t2(BB1), f2(BB2)
35989 //
35990 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
35991 // renaming is to note that on the path through BB1, t2 is really just a
35992 // copy of t1, and do that renaming, properly generating:
35993 //
35994 // t2 = PHI t1(BB1), f1(BB2)
35995 // t3 = PHI t1(BB1), f2(BB2)
35996 //
35997 // Case 2:
35998 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
35999 // function - EmitLoweredCascadedSelect.
36000
36001 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36002 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36003 MachineInstr *LastCMOV = &MI;
36004 MachineBasicBlock::iterator NextMIIt = next_nodbg(MI.getIterator(), ThisMBB->end());
36005
36006 // Check for case 1, where there are multiple CMOVs with the same condition
36007 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36008 // number of jumps the most.
36009
36010 if (isCMOVPseudo(MI)) {
36011 // See if we have a string of CMOVS with the same condition. Skip over
36012 // intervening debug insts.
36013 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36014 (NextMIIt->getOperand(3).getImm() == CC ||
36015 NextMIIt->getOperand(3).getImm() == OppCC)) {
36016 LastCMOV = &*NextMIIt;
36017 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36018 }
36019 }
36020
36021 // This checks for case 2, but only do this if we didn't already find
36022 // case 1, as indicated by LastCMOV == MI.
36023 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36024 NextMIIt->getOpcode() == MI.getOpcode() &&
36025 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36026 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36027 NextMIIt->getOperand(1).isKill()) {
36028 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36029 }
36030
36031 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36032 MachineFunction *F = ThisMBB->getParent();
36033 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36034 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36035
36036 MachineFunction::iterator It = ++ThisMBB->getIterator();
36037 F->insert(It, FalseMBB);
36038 F->insert(It, SinkMBB);
36039
36040 // Set the call frame size on entry to the new basic blocks.
36041 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36042 FalseMBB->setCallFrameSize(CallFrameSize);
36043 SinkMBB->setCallFrameSize(CallFrameSize);
36044
36045 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36046 // live into the sink and copy blocks.
36047 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36048 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36049 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36050 FalseMBB->addLiveIn(X86::EFLAGS);
36051 SinkMBB->addLiveIn(X86::EFLAGS);
36052 }
36053
36054 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36055 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36056 MachineBasicBlock::iterator(LastCMOV));
36057 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36058 if (MI.isDebugInstr())
36059 SinkMBB->push_back(MI.removeFromParent());
36060
36061 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36062 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36063 std::next(MachineBasicBlock::iterator(LastCMOV)),
36064 ThisMBB->end());
36065 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36066
36067 // Fallthrough block for ThisMBB.
36068 ThisMBB->addSuccessor(FalseMBB);
36069 // The true block target of the first (or only) branch is always a SinkMBB.
36070 ThisMBB->addSuccessor(SinkMBB);
36071 // Fallthrough block for FalseMBB.
36072 FalseMBB->addSuccessor(SinkMBB);
36073
36074 // Create the conditional branch instruction.
36075 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36076
36077 // SinkMBB:
36078 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36079 // ...
36080 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36081 MachineBasicBlock::iterator MIItEnd =
36082 std::next(MachineBasicBlock::iterator(LastCMOV));
36083 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36084
36085 // Now remove the CMOV(s).
36086 ThisMBB->erase(MIItBegin, MIItEnd);
36087
36088 return SinkMBB;
36089}
36090
36091static unsigned getSUBriOpcode(bool IsLP64) {
36092 if (IsLP64)
36093 return X86::SUB64ri32;
36094 else
36095 return X86::SUB32ri;
36096}
36097
36098MachineBasicBlock *
36099X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36100 MachineBasicBlock *MBB) const {
36101 MachineFunction *MF = MBB->getParent();
36102 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36103 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36104 const MIMetadata MIMD(MI);
36105 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36106
36107 const unsigned ProbeSize = getStackProbeSize(*MF);
36108
36109 MachineRegisterInfo &MRI = MF->getRegInfo();
36110 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36111 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36112 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36113
36114 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36115 MF->insert(MBBIter, testMBB);
36116 MF->insert(MBBIter, blockMBB);
36117 MF->insert(MBBIter, tailMBB);
36118
36119 Register sizeVReg = MI.getOperand(1).getReg();
36120
36121 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36122
36123 Register TmpStackPtr = MRI.createVirtualRegister(
36124 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36125 Register FinalStackPtr = MRI.createVirtualRegister(
36126 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36127
36128 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36129 .addReg(physSPReg);
36130 {
36131 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36132 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36133 .addReg(TmpStackPtr)
36134 .addReg(sizeVReg);
36135 }
36136
36137 // test rsp size
36138
36139 BuildMI(testMBB, MIMD,
36140 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36141 .addReg(FinalStackPtr)
36142 .addReg(physSPReg);
36143
36144 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36145 .addMBB(tailMBB)
36146 .addImm(X86::COND_GE);
36147 testMBB->addSuccessor(blockMBB);
36148 testMBB->addSuccessor(tailMBB);
36149
36150 // Touch the block then extend it. This is done on the opposite side of
36151 // static probe where we allocate then touch, to avoid the need of probing the
36152 // tail of the static alloca. Possible scenarios are:
36153 //
36154 // + ---- <- ------------ <- ------------- <- ------------ +
36155 // | |
36156 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36157 // | |
36158 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36159 //
36160 // The property we want to enforce is to never have more than [page alloc] between two probes.
36161
36162 const unsigned XORMIOpc =
36163 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36164 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36165 .addImm(0);
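 // 'xor $0, (%rsp)' rewrites the word at the stack pointer with its own value:
 // a cheap read-modify-write that touches (probes) the page without changing it.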
36166
36167 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36168 physSPReg)
36169 .addReg(physSPReg)
36170 .addImm(ProbeSize);
36171
36172 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36173 blockMBB->addSuccessor(testMBB);
36174
36175 // Replace original instruction by the expected stack ptr
36176 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36177 MI.getOperand(0).getReg())
36178 .addReg(FinalStackPtr);
36179
36180 tailMBB->splice(tailMBB->end(), MBB,
36181 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36182 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36183 MBB->addSuccessor(testMBB);
36184
36185 // Delete the original pseudo instruction.
36186 MI.eraseFromParent();
36187
36188 // And we're done.
36189 return tailMBB;
36190}
36191
36192MachineBasicBlock *
36193X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36194 MachineBasicBlock *BB) const {
36195 MachineFunction *MF = BB->getParent();
36196 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36197 const MIMetadata MIMD(MI);
36198 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36199
36200 assert(MF->shouldSplitStack());
36201
36202 const bool Is64Bit = Subtarget.is64Bit();
36203 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36204
36205 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36206 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
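 // Offsets of the stack-limit slot consulted by the split-stack runtime:
 // %fs:0x70 on 64-bit LP64, %fs:0x40 on x32, and %gs:0x30 on 32-bit targets.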
36207
36208 // BB:
36209 // ... [Till the alloca]
36210 // If stacklet is not large enough, jump to mallocMBB
36211 //
36212 // bumpMBB:
36213 // Allocate by subtracting from RSP
36214 // Jump to continueMBB
36215 //
36216 // mallocMBB:
36217 // Allocate by call to runtime
36218 //
36219 // continueMBB:
36220 // ...
36221 // [rest of original BB]
36222 //
36223
36224 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36225 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36226 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36227 MachineRegisterInfo &MRI = MF->getRegInfo();
36227
36229 const TargetRegisterClass *AddrRegClass =
36230 getRegClassFor(getPointerTy(MF->getDataLayout()));
36231
36232 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36233 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36234 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36235 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36236 sizeVReg = MI.getOperand(1).getReg(),
36237 physSPReg =
36238 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36239
36240 MachineFunction::iterator MBBIter = ++BB->getIterator();
36241
36242 MF->insert(MBBIter, bumpMBB);
36243 MF->insert(MBBIter, mallocMBB);
36244 MF->insert(MBBIter, continueMBB);
36245
36246 continueMBB->splice(continueMBB->begin(), BB,
36247 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36248 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36249
36250 // Add code to the main basic block to check if the stack limit has been hit,
36251 // and if so, jump to mallocMBB otherwise to bumpMBB.
36252 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36253 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36254 .addReg(tmpSPVReg).addReg(sizeVReg);
36255 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36256 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36257 .addReg(SPLimitVReg);
36258 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36259
36260 // bumpMBB simply decreases the stack pointer, since we know the current
36261 // stacklet has enough space.
36262 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36263 .addReg(SPLimitVReg);
36264 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36265 .addReg(SPLimitVReg);
36266 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36267
36268 // Calls into a routine in libgcc to allocate more space from the heap.
36269 const uint32_t *RegMask =
36270 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36271 if (IsLP64) {
36272 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36273 .addReg(sizeVReg);
36274 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36275 .addExternalSymbol("__morestack_allocate_stack_space")
36276 .addRegMask(RegMask)
36277 .addReg(X86::RDI, RegState::Implicit)
36278 .addReg(X86::RAX, RegState::ImplicitDefine);
36279 } else if (Is64Bit) {
36280 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36281 .addReg(sizeVReg);
36282 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36283 .addExternalSymbol("__morestack_allocate_stack_space")
36284 .addRegMask(RegMask)
36285 .addReg(X86::EDI, RegState::Implicit)
36286 .addReg(X86::EAX, RegState::ImplicitDefine);
36287 } else {
36288 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36289 .addImm(12);
36290 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36291 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36292 .addExternalSymbol("__morestack_allocate_stack_space")
36293 .addRegMask(RegMask)
36294 .addReg(X86::EAX, RegState::ImplicitDefine);
36295 }
36296
36297 if (!Is64Bit)
36298 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36299 .addImm(16);
36300
36301 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36302 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36303 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36304
36305 // Set up the CFG correctly.
36306 BB->addSuccessor(bumpMBB);
36307 BB->addSuccessor(mallocMBB);
36308 mallocMBB->addSuccessor(continueMBB);
36309 bumpMBB->addSuccessor(continueMBB);
36310
36311 // Take care of the PHI nodes.
36312 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36313 MI.getOperand(0).getReg())
36314 .addReg(mallocPtrVReg)
36315 .addMBB(mallocMBB)
36316 .addReg(bumpSPPtrVReg)
36317 .addMBB(bumpMBB);
36318
36319 // Delete the original pseudo instruction.
36320 MI.eraseFromParent();
36321
36322 // And we're done.
36323 return continueMBB;
36324}
36325
36326MachineBasicBlock *
36327X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36328 MachineBasicBlock *BB) const {
36329 MachineFunction *MF = BB->getParent();
36330 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36331 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36332 const MIMetadata MIMD(MI);
36333
36334  assert(!isAsynchronousEHPersonality(
36335             classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36336         "SEH does not use catchret!");
36337
36338 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36339 if (!Subtarget.is32Bit())
36340 return BB;
36341
36342 // C++ EH creates a new target block to hold the restore code, and wires up
36343 // the new block to the return destination with a normal JMP_4.
36344  MachineBasicBlock *RestoreMBB =
36345      MF->CreateMachineBasicBlock(BB->getBasicBlock());
36346  assert(BB->succ_size() == 1);
36347 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36348 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36349 BB->addSuccessor(RestoreMBB);
36350 MI.getOperand(0).setMBB(RestoreMBB);
36351
36352 // Marking this as an EH pad but not a funclet entry block causes PEI to
36353 // restore stack pointers in the block.
36354 RestoreMBB->setIsEHPad(true);
36355
36356 auto RestoreMBBI = RestoreMBB->begin();
36357 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36358 return BB;
36359}
36360
36361MachineBasicBlock *
36362X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36363 MachineBasicBlock *BB) const {
36364 // This is pretty easy. We're taking the value that we received from
36365 // our load from the relocation, sticking it in either RDI (x86-64)
36366 // or EAX and doing an indirect call. The return value will then
36367 // be in the normal return register.
36368 MachineFunction *F = BB->getParent();
36369 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36370 const MIMetadata MIMD(MI);
36371
36372 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36373 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36374
36375 // Get a register mask for the lowered call.
36376 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36377 // proper register mask.
36378  const uint32_t *RegMask =
36379      Subtarget.is64Bit() ?
36380      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36381      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36382  if (Subtarget.is64Bit()) {
36383    MachineInstrBuilder MIB =
36384        BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36385 .addReg(X86::RIP)
36386 .addImm(0)
36387 .addReg(0)
36388 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36389 MI.getOperand(3).getTargetFlags())
36390 .addReg(0);
36391 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36392 addDirectMem(MIB, X86::RDI);
36393 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36394  } else if (!isPositionIndependent()) {
36395    MachineInstrBuilder MIB =
36396        BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36397 .addReg(0)
36398 .addImm(0)
36399 .addReg(0)
36400 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36401 MI.getOperand(3).getTargetFlags())
36402 .addReg(0);
36403 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36404 addDirectMem(MIB, X86::EAX);
36405 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36406  } else {
36407    MachineInstrBuilder MIB =
36408        BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36409 .addReg(TII->getGlobalBaseReg(F))
36410 .addImm(0)
36411 .addReg(0)
36412 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36413 MI.getOperand(3).getTargetFlags())
36414 .addReg(0);
36415 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36416 addDirectMem(MIB, X86::EAX);
36417 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36418 }
36419
36420 MI.eraseFromParent(); // The pseudo instruction is gone now.
36421 return BB;
36422}
36423
36424static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36425 switch (RPOpc) {
36426 case X86::INDIRECT_THUNK_CALL32:
36427 return X86::CALLpcrel32;
36428 case X86::INDIRECT_THUNK_CALL64:
36429 return X86::CALL64pcrel32;
36430 case X86::INDIRECT_THUNK_TCRETURN32:
36431 return X86::TCRETURNdi;
36432 case X86::INDIRECT_THUNK_TCRETURN64:
36433 return X86::TCRETURNdi64;
36434 }
36435 llvm_unreachable("not indirect thunk opcode");
36436}
36437
36438static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36439 unsigned Reg) {
36440 if (Subtarget.useRetpolineExternalThunk()) {
36441 // When using an external thunk for retpolines, we pick names that match the
36442 // names GCC happens to use as well. This helps simplify the implementation
36443 // of the thunks for kernels where they have no easy ability to create
36444 // aliases and are doing non-trivial configuration of the thunk's body. For
36445 // example, the Linux kernel will do boot-time hot patching of the thunk
36446 // bodies and cannot easily export aliases of these to loaded modules.
36447 //
36448 // Note that at any point in the future, we may need to change the semantics
36449 // of how we implement retpolines and at that time will likely change the
36450 // name of the called thunk. Essentially, there is no hard guarantee that
36451    // LLVM will generate calls to specific thunks; we merely make a best-effort
36452 // attempt to help out kernels and other systems where duplicating the
36453 // thunks is costly.
36454 switch (Reg) {
36455 case X86::EAX:
36456 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36457 return "__x86_indirect_thunk_eax";
36458 case X86::ECX:
36459 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36460 return "__x86_indirect_thunk_ecx";
36461 case X86::EDX:
36462 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36463 return "__x86_indirect_thunk_edx";
36464 case X86::EDI:
36465 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36466 return "__x86_indirect_thunk_edi";
36467 case X86::R11:
36468 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36469 return "__x86_indirect_thunk_r11";
36470 }
36471 llvm_unreachable("unexpected reg for external indirect thunk");
36472 }
36473
36474 if (Subtarget.useRetpolineIndirectCalls() ||
36475 Subtarget.useRetpolineIndirectBranches()) {
36476 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36477 switch (Reg) {
36478 case X86::EAX:
36479 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36480 return "__llvm_retpoline_eax";
36481 case X86::ECX:
36482 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36483 return "__llvm_retpoline_ecx";
36484 case X86::EDX:
36485 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36486 return "__llvm_retpoline_edx";
36487 case X86::EDI:
36488 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36489 return "__llvm_retpoline_edi";
36490 case X86::R11:
36491 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36492 return "__llvm_retpoline_r11";
36493 }
36494 llvm_unreachable("unexpected reg for retpoline");
36495 }
36496
36497 if (Subtarget.useLVIControlFlowIntegrity()) {
36498 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36499 return "__llvm_lvi_thunk_r11";
36500 }
36501 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36502}
36503
36504MachineBasicBlock *
36505X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36506 MachineBasicBlock *BB) const {
36507 // Copy the virtual register into the R11 physical register and
36508 // call the retpoline thunk.
36509 const MIMetadata MIMD(MI);
36510 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36511 Register CalleeVReg = MI.getOperand(0).getReg();
36512 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36513
36514 // Find an available scratch register to hold the callee. On 64-bit, we can
36515 // just use R11, but we scan for uses anyway to ensure we don't generate
36516 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36517 // already a register use operand to the call to hold the callee. If none
36518 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36519 // register and ESI is the base pointer to realigned stack frames with VLAs.
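  // For example (64-bit sketch): an indirect call through %vreg becomes
  //   movq  %vreg, %r11
  //   callq __llvm_retpoline_r11    ; or __x86_indirect_thunk_r11 when
  //                                 ; external thunks are in use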
36520 SmallVector<unsigned, 3> AvailableRegs;
36521 if (Subtarget.is64Bit())
36522 AvailableRegs.push_back(X86::R11);
36523 else
36524 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36525
36526 // Zero out any registers that are already used.
36527 for (const auto &MO : MI.operands()) {
36528 if (MO.isReg() && MO.isUse())
36529 llvm::replace(AvailableRegs, static_cast<unsigned>(MO.getReg()), 0U);
36530 }
36531
36532 // Choose the first remaining non-zero available register.
36533 unsigned AvailableReg = 0;
36534 for (unsigned MaybeReg : AvailableRegs) {
36535 if (MaybeReg) {
36536 AvailableReg = MaybeReg;
36537 break;
36538 }
36539 }
36540 if (!AvailableReg)
36541 report_fatal_error("calling convention incompatible with retpoline, no "
36542 "available registers");
36543
36544 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36545
36546 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36547 .addReg(CalleeVReg);
36548 MI.getOperand(0).ChangeToES(Symbol);
36549  MI.setDesc(TII->get(Opc));
36550  MachineInstrBuilder(*BB->getParent(), &MI)
36551      .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36552 return BB;
36553}
36554
36555/// SetJmp implies a future control-flow change upon calling the corresponding
36556/// LongJmp.
36557/// Instead of using the 'return' instruction, the long jump fixes the stack and
36558/// performs an indirect branch. To do so, it uses the registers that were stored
36559/// in the jump buffer (when calling SetJmp).
36560/// If the shadow stack is enabled, we need to fix it as well, because some
36561/// return addresses will be skipped.
36562/// The function will save the SSP for future fixing in the function
36563/// emitLongJmpShadowStackFix.
36564/// \sa emitLongJmpShadowStackFix
36565/// \param [in] MI The temporary Machine Instruction for the builtin.
36566/// \param [in] MBB The Machine Basic Block that will be modified.
36567void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36568 MachineBasicBlock *MBB) const {
36569 const MIMetadata MIMD(MI);
36570 MachineFunction *MF = MBB->getParent();
36571  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36572  MachineRegisterInfo &MRI = MF->getRegInfo();
36573  MachineInstrBuilder MIB;
36574
36575 // Memory Reference.
36576 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36577
36578 // Initialize a register with zero.
36579 MVT PVT = getPointerTy(MF->getDataLayout());
36580 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36581 Register ZReg = MRI.createVirtualRegister(PtrRC);
36582 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36583 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
36584 .addDef(ZReg)
36585 .addReg(ZReg, RegState::Undef)
36586 .addReg(ZReg, RegState::Undef);
36587
36588 // Read the current SSP Register value to the zeroed register.
36589 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36590 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36591 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36592
36593 // Write the SSP register value to offset 3 in input memory buffer.
36594 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36595 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
36596 const int64_t SSPOffset = 3 * PVT.getStoreSize();
36597 const unsigned MemOpndSlot = 1;
36598 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36599 if (i == X86::AddrDisp)
36600 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
36601 else
36602 MIB.add(MI.getOperand(MemOpndSlot + i));
36603 }
36604 MIB.addReg(SSPCopyReg);
36605 MIB.setMemRefs(MMOs);
36606}
36607
36608MachineBasicBlock *
36609X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
36610 MachineBasicBlock *MBB) const {
36611 const MIMetadata MIMD(MI);
36612 MachineFunction *MF = MBB->getParent();
36613 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36614  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36615  MachineRegisterInfo &MRI = MF->getRegInfo();
36616
36617  const BasicBlock *BB = MBB->getBasicBlock();
36618  MachineFunction::iterator I = ++MBB->getIterator();
36619
36620 // Memory Reference
36621 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36622
36623 unsigned DstReg;
36624 unsigned MemOpndSlot = 0;
36625
36626 unsigned CurOp = 0;
36627
36628 DstReg = MI.getOperand(CurOp++).getReg();
36629 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36630 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
36631 (void)TRI;
36632 Register mainDstReg = MRI.createVirtualRegister(RC);
36633 Register restoreDstReg = MRI.createVirtualRegister(RC);
36634
36635 MemOpndSlot = CurOp;
36636
36637 MVT PVT = getPointerTy(MF->getDataLayout());
36638 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36639 "Invalid Pointer Size!");
36640
36641 // For v = setjmp(buf), we generate
36642 //
36643 // thisMBB:
36644 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
36645 // SjLjSetup restoreMBB
36646 //
36647 // mainMBB:
36648 // v_main = 0
36649 //
36650 // sinkMBB:
36651 // v = phi(main, restore)
36652 //
36653 // restoreMBB:
36654 // if base pointer being used, load it from frame
36655 // v_restore = 1
36656
36657 MachineBasicBlock *thisMBB = MBB;
36658 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36659 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36660 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
36661 MF->insert(I, mainMBB);
36662 MF->insert(I, sinkMBB);
36663 MF->push_back(restoreMBB);
36664 restoreMBB->setMachineBlockAddressTaken();
36665
36666  MachineInstrBuilder MIB;
36667
36668 // Transfer the remainder of BB and its successor edges to sinkMBB.
36669 sinkMBB->splice(sinkMBB->begin(), MBB,
36670                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36671  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36672
36673 // thisMBB:
36674 unsigned PtrStoreOpc = 0;
36675 unsigned LabelReg = 0;
36676 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36677  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36678                     !isPositionIndependent();
36679
36680 // Prepare IP either in reg or imm.
36681 if (!UseImmLabel) {
36682 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36683 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36684 LabelReg = MRI.createVirtualRegister(PtrRC);
36685 if (Subtarget.is64Bit()) {
36686 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
36687 .addReg(X86::RIP)
36688 .addImm(0)
36689 .addReg(0)
36690 .addMBB(restoreMBB)
36691 .addReg(0);
36692 } else {
36693 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
36694 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
36695 .addReg(XII->getGlobalBaseReg(MF))
36696 .addImm(0)
36697 .addReg(0)
36698 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
36699 .addReg(0);
36700 }
36701 } else
36702 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36703 // Store IP
36704 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
36705 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36706 if (i == X86::AddrDisp)
36707 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
36708 else
36709 MIB.add(MI.getOperand(MemOpndSlot + i));
36710 }
36711 if (!UseImmLabel)
36712 MIB.addReg(LabelReg);
36713 else
36714 MIB.addMBB(restoreMBB);
36715 MIB.setMemRefs(MMOs);
36716
36717 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36718 emitSetJmpShadowStackFix(MI, thisMBB);
36719 }
36720
36721 // Setup
36722 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
36723 .addMBB(restoreMBB);
36724
36725 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36726 MIB.addRegMask(RegInfo->getNoPreservedMask());
36727 thisMBB->addSuccessor(mainMBB);
36728 thisMBB->addSuccessor(restoreMBB);
36729
36730 // mainMBB:
36731 // EAX = 0
36732 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
36733 mainMBB->addSuccessor(sinkMBB);
36734
36735 // sinkMBB:
36736 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
36737 .addReg(mainDstReg)
36738 .addMBB(mainMBB)
36739 .addReg(restoreDstReg)
36740 .addMBB(restoreMBB);
36741
36742 // restoreMBB:
36743 if (RegInfo->hasBasePointer(*MF)) {
36744 const bool Uses64BitFramePtr =
36745        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36746    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
36747    X86FI->setRestoreBasePointer(MF);
36748 Register FramePtr = RegInfo->getFrameRegister(*MF);
36749 Register BasePtr = RegInfo->getBaseRegister();
36750 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
36751 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
36752                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
36753        .setMIFlag(MachineInstr::FrameSetup);
36754  }
36755 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
36756 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36757 restoreMBB->addSuccessor(sinkMBB);
36758
36759 MI.eraseFromParent();
36760 return sinkMBB;
36761}
36762
36763/// Fix the shadow stack using the previously saved SSP pointer.
36764/// \sa emitSetJmpShadowStackFix
36765/// \param [in] MI The temporary Machine Instruction for the builtin.
36766/// \param [in] MBB The Machine Basic Block that will be modified.
36767/// \return The sink MBB that will perform the future indirect branch.
36768MachineBasicBlock *
36769X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
36770 MachineBasicBlock *MBB) const {
36771 const MIMetadata MIMD(MI);
36772 MachineFunction *MF = MBB->getParent();
36773  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36774  MachineRegisterInfo &MRI = MF->getRegInfo();
36775
36776 // Memory Reference
36777 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36778
36779 MVT PVT = getPointerTy(MF->getDataLayout());
36780 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36781
36782 // checkSspMBB:
36783 // xor vreg1, vreg1
36784 // rdssp vreg1
36785 // test vreg1, vreg1
36786 // je sinkMBB # Jump if Shadow Stack is not supported
36787 // fallMBB:
36788 // mov buf+24/12(%rip), vreg2
36789 // sub vreg1, vreg2
36790 // jbe sinkMBB # No need to fix the Shadow Stack
36791 // fixShadowMBB:
36792 // shr 3/2, vreg2
36793 // incssp vreg2 # fix the SSP according to the lower 8 bits
36794 // shr 8, vreg2
36795 // je sinkMBB
36796 // fixShadowLoopPrepareMBB:
36797 // shl vreg2
36798 // mov 128, vreg3
36799 // fixShadowLoopMBB:
36800 // incssp vreg3
36801 // dec vreg2
36802 // jne fixShadowLoopMBB # Iterate until you finish fixing
36803 // # the Shadow Stack
36804 // sinkMBB:
36805
36806  MachineFunction::iterator I = ++MBB->getIterator();
36807  const BasicBlock *BB = MBB->getBasicBlock();
36808
36809 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
36810 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36811 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
36812 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
36813 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
36814 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36815 MF->insert(I, checkSspMBB);
36816 MF->insert(I, fallMBB);
36817 MF->insert(I, fixShadowMBB);
36818 MF->insert(I, fixShadowLoopPrepareMBB);
36819 MF->insert(I, fixShadowLoopMBB);
36820 MF->insert(I, sinkMBB);
36821
36822 // Transfer the remainder of BB and its successor edges to sinkMBB.
36823 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
36824                  MBB->end());
36825  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36826
36827 MBB->addSuccessor(checkSspMBB);
36828
36829 // Initialize a register with zero.
36830 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
36831 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
36832
36833 if (PVT == MVT::i64) {
36834 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
36835 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36836 .addImm(0)
36837 .addReg(ZReg)
36838 .addImm(X86::sub_32bit);
36839 ZReg = TmpZReg;
36840 }
36841
36842 // Read the current SSP Register value to the zeroed register.
36843 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36844 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36845 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36846
36847 // Check whether the result of the SSP register is zero and jump directly
36848 // to the sink.
36849 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
36850 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
36851 .addReg(SSPCopyReg)
36852 .addReg(SSPCopyReg);
36853 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
36854      .addMBB(sinkMBB)
36855      .addImm(X86::COND_E);
36856  checkSspMBB->addSuccessor(sinkMBB);
36857 checkSspMBB->addSuccessor(fallMBB);
36858
36859 // Reload the previously saved SSP register value.
36860 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
36861 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36862  const int64_t SPPOffset = 3 * PVT.getStoreSize();
36863  MachineInstrBuilder MIB =
36864      BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
36865 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36866 const MachineOperand &MO = MI.getOperand(i);
36867 if (i == X86::AddrDisp)
36868 MIB.addDisp(MO, SPPOffset);
36869 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36870 // preserve kill flags.
36871 MIB.addReg(MO.getReg());
36872 else
36873 MIB.add(MO);
36874 }
36875 MIB.setMemRefs(MMOs);
36876
36877 // Subtract the current SSP from the previous SSP.
36878 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
36879 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
36880 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
36881 .addReg(PrevSSPReg)
36882 .addReg(SSPCopyReg);
36883
36884 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
36885 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
36886      .addMBB(sinkMBB)
36887      .addImm(X86::COND_BE);
36888  fallMBB->addSuccessor(sinkMBB);
36889 fallMBB->addSuccessor(fixShadowMBB);
36890
36891  // Shift right by 2 (32-bit) or 3 (64-bit) because incssp multiplies the argument by 4 or 8.
36892 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
36893 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
36894 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
36895 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
36896 .addReg(SspSubReg)
36897 .addImm(Offset);
36898
36899  // Increase the SSP using only the lower 8 bits of the delta.
36900 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
36901 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36902
36903 // Reset the lower 8 bits.
36904 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
36905 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
36906 .addReg(SspFirstShrReg)
36907 .addImm(8);
36908
36909 // Jump if the result of the shift is zero.
36910 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
36911      .addMBB(sinkMBB)
36912      .addImm(X86::COND_E);
36913  fixShadowMBB->addSuccessor(sinkMBB);
36914 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36915
36916 // Do a single shift left.
36917 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
36918 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
36919 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
36920 .addReg(SspSecondShrReg)
36921 .addImm(1);
36922
36923 // Save the value 128 to a register (will be used next with incssp).
36924 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
36925 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
36926 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
36927 .addImm(128);
36928 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36929
36930 // Since incssp only looks at the lower 8 bits, we might need to do several
36931 // iterations of incssp until we finish fixing the shadow stack.
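  // Worked example (64-bit, hypothetical numbers): a saved-vs-current SSP delta
  // of 0x1100 bytes becomes 0x220 entries after the shift by 3; the first
  // INCSSPQ consumes the low 8 bits (0x20 entries), and the loop then runs
  // (0x200 >> 8) << 1 = 4 iterations of INCSSPQ 128 for the remaining 0x200.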
36932 Register DecReg = MRI.createVirtualRegister(PtrRC);
36933 Register CounterReg = MRI.createVirtualRegister(PtrRC);
36934 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
36935 .addReg(SspAfterShlReg)
36936 .addMBB(fixShadowLoopPrepareMBB)
36937 .addReg(DecReg)
36938 .addMBB(fixShadowLoopMBB);
36939
36940 // Every iteration we increase the SSP by 128.
36941 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
36942
36943 // Every iteration we decrement the counter by 1.
36944 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
36945 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
36946
36947 // Jump if the counter is not zero yet.
36948 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
36949      .addMBB(fixShadowLoopMBB)
36950      .addImm(X86::COND_NE);
36951  fixShadowLoopMBB->addSuccessor(sinkMBB);
36952 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36953
36954 return sinkMBB;
36955}
36956
36957MachineBasicBlock *
36958X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
36959 MachineBasicBlock *MBB) const {
36960 const MIMetadata MIMD(MI);
36961 MachineFunction *MF = MBB->getParent();
36962  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36963  MachineRegisterInfo &MRI = MF->getRegInfo();
36964
36965 // Memory Reference
36966 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36967
36968 MVT PVT = getPointerTy(MF->getDataLayout());
36969 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36970 "Invalid Pointer Size!");
36971
36972 const TargetRegisterClass *RC =
36973 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36974 Register Tmp = MRI.createVirtualRegister(RC);
36975 // Since FP is only updated here but NOT referenced, it's treated as GPR.
36976 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36977 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
36978 Register SP = RegInfo->getStackRegister();
36979
36980  MachineInstrBuilder MIB;
36981
36982 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36983 const int64_t SPOffset = 2 * PVT.getStoreSize();
36984
36985 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36986 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
36987
36988 MachineBasicBlock *thisMBB = MBB;
36989
36990  // When the CET shadow stack is enabled, we need to fix the shadow stack here.
36991 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
36992 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
36993 }
36994
36995 // Reload FP
36996 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
36997 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36998 const MachineOperand &MO = MI.getOperand(i);
36999 if (MO.isReg()) // Don't add the whole operand, we don't want to
37000 // preserve kill flags.
37001 MIB.addReg(MO.getReg());
37002 else
37003 MIB.add(MO);
37004 }
37005 MIB.setMemRefs(MMOs);
37007
37008 // Reload IP
37009 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37010 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37011 const MachineOperand &MO = MI.getOperand(i);
37012 if (i == X86::AddrDisp)
37013 MIB.addDisp(MO, LabelOffset);
37014 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37015 // preserve kill flags.
37016 MIB.addReg(MO.getReg());
37017 else
37018 MIB.add(MO);
37019 }
37020 MIB.setMemRefs(MMOs);
37021
37022 // Reload SP
37023 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37024 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37025 if (i == X86::AddrDisp)
37026 MIB.addDisp(MI.getOperand(i), SPOffset);
37027 else
37028 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37029 // the last instruction of the expansion.
37030 }
37031 MIB.setMemRefs(MMOs);
37033
37034 // Jump
37035 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37036
37037 MI.eraseFromParent();
37038 return thisMBB;
37039}
37040
37041void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37042                                               MachineBasicBlock *MBB,
37043                                               MachineBasicBlock *DispatchBB,
37044 int FI) const {
37045 const MIMetadata MIMD(MI);
37046  MachineFunction *MF = MBB->getParent();
37047  MachineRegisterInfo *MRI = &MF->getRegInfo();
37048  const X86InstrInfo *TII = Subtarget.getInstrInfo();
37049
37050 MVT PVT = getPointerTy(MF->getDataLayout());
37051 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37052
37053 unsigned Op = 0;
37054 unsigned VR = 0;
37055
37056  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37057                     !isPositionIndependent();
37058
37059 if (UseImmLabel) {
37060 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37061 } else {
37062 const TargetRegisterClass *TRC =
37063 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37064 VR = MRI->createVirtualRegister(TRC);
37065 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37066
37067 if (Subtarget.is64Bit())
37068 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37069 .addReg(X86::RIP)
37070 .addImm(1)
37071 .addReg(0)
37072 .addMBB(DispatchBB)
37073 .addReg(0);
37074 else
37075 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37076 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37077 .addImm(1)
37078 .addReg(0)
37079 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37080 .addReg(0);
37081 }
37082
37083 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37084 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37085 if (UseImmLabel)
37086 MIB.addMBB(DispatchBB);
37087 else
37088 MIB.addReg(VR);
37089}
37090
37091MachineBasicBlock *
37092X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37093 MachineBasicBlock *BB) const {
37094 const MIMetadata MIMD(MI);
37095  MachineFunction *MF = BB->getParent();
37096  MachineRegisterInfo *MRI = &MF->getRegInfo();
37097  const X86InstrInfo *TII = Subtarget.getInstrInfo();
37098 int FI = MF->getFrameInfo().getFunctionContextIndex();
37099
37100 // Get a mapping of the call site numbers to all of the landing pads they're
37101 // associated with.
37102  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37103  unsigned MaxCSNum = 0;
37104 for (auto &MBB : *MF) {
37105 if (!MBB.isEHPad())
37106 continue;
37107
37108 MCSymbol *Sym = nullptr;
37109 for (const auto &MI : MBB) {
37110 if (MI.isDebugInstr())
37111 continue;
37112
37113 assert(MI.isEHLabel() && "expected EH_LABEL");
37114 Sym = MI.getOperand(0).getMCSymbol();
37115 break;
37116 }
37117
37118 if (!MF->hasCallSiteLandingPad(Sym))
37119 continue;
37120
37121 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37122 CallSiteNumToLPad[CSI].push_back(&MBB);
37123 MaxCSNum = std::max(MaxCSNum, CSI);
37124 }
37125 }
37126
37127 // Get an ordered list of the machine basic blocks for the jump table.
37128  std::vector<MachineBasicBlock *> LPadList;
37129  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37130  LPadList.reserve(CallSiteNumToLPad.size());
37131
37132 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37133 for (auto &LP : CallSiteNumToLPad[CSI]) {
37134 LPadList.push_back(LP);
37135 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37136 }
37137 }
37138
37139 assert(!LPadList.empty() &&
37140 "No landing pad destinations for the dispatch jump table!");
37141
37142 // Create the MBBs for the dispatch code.
37143
37144 // Shove the dispatch's address into the return slot in the function context.
37145 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37146 DispatchBB->setIsEHPad(true);
37147
37148 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37149 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37150 DispatchBB->addSuccessor(TrapBB);
37151
37152 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37153 DispatchBB->addSuccessor(DispContBB);
37154
37155 // Insert MBBs.
37156 MF->push_back(DispatchBB);
37157 MF->push_back(DispContBB);
37158 MF->push_back(TrapBB);
37159
37160 // Insert code into the entry block that creates and registers the function
37161 // context.
37162 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37163
37164 // Create the jump table and associated information
37165 unsigned JTE = getJumpTableEncoding();
37166 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37167 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37168
37169 const X86RegisterInfo &RI = TII->getRegisterInfo();
37170 // Add a register mask with no preserved registers. This results in all
37171 // registers being marked as clobbered.
37172 if (RI.hasBasePointer(*MF)) {
37173 const bool FPIs64Bit =
37174 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37175 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37176 MFI->setRestoreBasePointer(MF);
37177
37178 Register FP = RI.getFrameRegister(*MF);
37179 Register BP = RI.getBaseRegister();
37180 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37181    addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37182                 MFI->getRestoreBasePointerOffset())
37183        .addRegMask(RI.getNoPreservedMask());
37184  } else {
37185    BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37186        .addRegMask(RI.getNoPreservedMask());
37187  }
37188
37189 // IReg is used as an index in a memory operand and therefore can't be SP
37190 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37191 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37192 Subtarget.is64Bit() ? 8 : 4);
37193 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37194 .addReg(IReg)
37195 .addImm(LPadList.size());
37196 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37197      .addMBB(TrapBB)
37198      .addImm(X86::COND_AE);
37199
37200 if (Subtarget.is64Bit()) {
37201 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37202 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37203
37204 // leaq .LJTI0_0(%rip), BReg
37205 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37206 .addReg(X86::RIP)
37207 .addImm(1)
37208 .addReg(0)
37209 .addJumpTableIndex(MJTI)
37210 .addReg(0);
37211 // movzx IReg64, IReg
37212 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37213 .addImm(0)
37214 .addReg(IReg)
37215 .addImm(X86::sub_32bit);
37216
37217    switch (JTE) {
37218    case MachineJumpTableInfo::EK_BlockAddress:
37219      // jmpq *(BReg,IReg64,8)
37220 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37221 .addReg(BReg)
37222 .addImm(8)
37223 .addReg(IReg64)
37224 .addImm(0)
37225 .addReg(0);
37226      break;
37227    case MachineJumpTableInfo::EK_LabelDifference32: {
37228      Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37229 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37230 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37231
37232 // movl (BReg,IReg64,4), OReg
37233 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37234 .addReg(BReg)
37235 .addImm(4)
37236 .addReg(IReg64)
37237 .addImm(0)
37238 .addReg(0);
37239 // movsx OReg64, OReg
37240 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37241 .addReg(OReg);
37242 // addq BReg, OReg64, TReg
37243 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37244 .addReg(OReg64)
37245 .addReg(BReg);
37246 // jmpq *TReg
37247 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37248 break;
37249 }
37250 default:
37251 llvm_unreachable("Unexpected jump table encoding");
37252 }
37253 } else {
37254 // jmpl *.LJTI0_0(,IReg,4)
37255 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37256 .addReg(0)
37257 .addImm(4)
37258 .addReg(IReg)
37259 .addJumpTableIndex(MJTI)
37260 .addReg(0);
37261 }
37262
37263 // Add the jump table entries as successors to the MBB.
37264  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37265  for (auto &LP : LPadList)
37266 if (SeenMBBs.insert(LP).second)
37267 DispContBB->addSuccessor(LP);
37268
37269 // N.B. the order the invoke BBs are processed in doesn't matter here.
37270  SmallVector<MachineBasicBlock *, 64> MBBLPads;
37271  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37272 for (MachineBasicBlock *MBB : InvokeBBs) {
37273 // Remove the landing pad successor from the invoke block and replace it
37274 // with the new dispatch block.
37275 // Keep a copy of Successors since it's modified inside the loop.
37276    SmallVector<MachineBasicBlock *, 4> Successors(MBB->succ_rbegin(),
37277                                                   MBB->succ_rend());
37278 // FIXME: Avoid quadratic complexity.
37279 for (auto *MBBS : Successors) {
37280 if (MBBS->isEHPad()) {
37281 MBB->removeSuccessor(MBBS);
37282 MBBLPads.push_back(MBBS);
37283 }
37284 }
37285
37286 MBB->addSuccessor(DispatchBB);
37287
37288 // Find the invoke call and mark all of the callee-saved registers as
37289    // 'implicit defined' so that they're spilled. This prevents instructions
37290    // from being moved to before the EH block, where they would never be
37291    // executed.
37292 for (auto &II : reverse(*MBB)) {
37293 if (!II.isCall())
37294 continue;
37295
37296      DenseMap<unsigned, bool> DefRegs;
37297      for (auto &MOp : II.operands())
37298 if (MOp.isReg())
37299 DefRegs[MOp.getReg()] = true;
37300
37301 MachineInstrBuilder MIB(*MF, &II);
37302 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37303 unsigned Reg = SavedRegs[RegIdx];
37304        if (!DefRegs[Reg])
37305          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37306      }
37307
37308 break;
37309 }
37310 }
37311
37312 // Mark all former landing pads as non-landing pads. The dispatch is the only
37313 // landing pad now.
37314 for (auto &LP : MBBLPads)
37315 LP->setIsEHPad(false);
37316
37317 // The instruction is gone now.
37318 MI.eraseFromParent();
37319 return BB;
37320}
37321
37322MachineBasicBlock *
37323X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37324 MachineBasicBlock *BB) const {
37325 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37326 // calls may require proper stack alignment.
37327 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37328 const MIMetadata MIMD(MI);
37329 MachineFunction &MF = *BB->getParent();
37330
37331 // Emit CALLSEQ_START right before the instruction.
37332 MF.getFrameInfo().setAdjustsStack(true);
37333 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37334 MachineInstrBuilder CallseqStart =
37335 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37336 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37337
37338 // Emit CALLSEQ_END right after the instruction.
37339 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37340 MachineInstrBuilder CallseqEnd =
37341 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37342 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37343
37344 return BB;
37345}
37346
37347MachineBasicBlock *
37348X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37349                                               MachineBasicBlock *BB) const {
37350 MachineFunction *MF = BB->getParent();
37351 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37352 const MIMetadata MIMD(MI);
37353
37354 auto TMMImmToTMMReg = [](unsigned Imm) {
37355 assert (Imm < 8 && "Illegal tmm index");
37356 return X86::TMM0 + Imm;
37357 };
37358 auto TMMImmToTMMPair = [](unsigned Imm) {
37359 assert(Imm < 8 && "Illegal tmm pair index.");
37360 return X86::TMM0_TMM1 + Imm / 2;
37361 };
37362 switch (MI.getOpcode()) {
37363 default:
37364 llvm_unreachable("Unexpected instr type to insert");
37365 case X86::INDIRECT_THUNK_CALL32:
37366 case X86::INDIRECT_THUNK_CALL64:
37367 case X86::INDIRECT_THUNK_TCRETURN32:
37368 case X86::INDIRECT_THUNK_TCRETURN64:
37369 return EmitLoweredIndirectThunk(MI, BB);
37370 case X86::CATCHRET:
37371 return EmitLoweredCatchRet(MI, BB);
37372 case X86::SEG_ALLOCA_32:
37373 case X86::SEG_ALLOCA_64:
37374 return EmitLoweredSegAlloca(MI, BB);
37375 case X86::PROBED_ALLOCA_32:
37376 case X86::PROBED_ALLOCA_64:
37377 return EmitLoweredProbedAlloca(MI, BB);
37378 case X86::TLSCall_32:
37379 case X86::TLSCall_64:
37380 return EmitLoweredTLSCall(MI, BB);
37381 case X86::CMOV_FR16:
37382 case X86::CMOV_FR16X:
37383 case X86::CMOV_FR32:
37384 case X86::CMOV_FR32X:
37385 case X86::CMOV_FR64:
37386 case X86::CMOV_FR64X:
37387 case X86::CMOV_GR8:
37388 case X86::CMOV_GR16:
37389 case X86::CMOV_GR32:
37390 case X86::CMOV_RFP32:
37391 case X86::CMOV_RFP64:
37392 case X86::CMOV_RFP80:
37393 case X86::CMOV_VR64:
37394 case X86::CMOV_VR128:
37395 case X86::CMOV_VR128X:
37396 case X86::CMOV_VR256:
37397 case X86::CMOV_VR256X:
37398 case X86::CMOV_VR512:
37399 case X86::CMOV_VK1:
37400 case X86::CMOV_VK2:
37401 case X86::CMOV_VK4:
37402 case X86::CMOV_VK8:
37403 case X86::CMOV_VK16:
37404 case X86::CMOV_VK32:
37405 case X86::CMOV_VK64:
37406 return EmitLoweredSelect(MI, BB);
37407
37408 case X86::FP80_ADDr:
37409 case X86::FP80_ADDm32: {
37410 // Change the floating point control register to use double extended
37411 // precision when performing the addition.
37412 int OrigCWFrameIdx =
37413 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37414 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37415 OrigCWFrameIdx);
37416
37417 // Load the old value of the control word...
37418 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37419 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37420 OrigCWFrameIdx);
37421
37422    // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double-extended
37423    // precision.
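    // For example, starting from the 53-bit-precision control word 0x027F,
    // OR 0x300 yields 0x037F, which selects 64-bit double-extended precision.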
37424 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37425 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37426 .addReg(OldCW, RegState::Kill)
37427 .addImm(0x300);
37428
37429 // Extract to 16 bits.
37430 Register NewCW16 =
37431 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37432 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37433 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37434
37435 // Prepare memory for FLDCW.
37436 int NewCWFrameIdx =
37437 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37438 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37439 NewCWFrameIdx)
37440 .addReg(NewCW16, RegState::Kill);
37441
37442 // Reload the modified control word now...
37443 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37444 NewCWFrameIdx);
37445
37446 // Do the addition.
37447 if (MI.getOpcode() == X86::FP80_ADDr) {
37448 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37449 .add(MI.getOperand(0))
37450 .add(MI.getOperand(1))
37451 .add(MI.getOperand(2));
37452 } else {
37453 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37454 .add(MI.getOperand(0))
37455 .add(MI.getOperand(1))
37456 .add(MI.getOperand(2))
37457 .add(MI.getOperand(3))
37458 .add(MI.getOperand(4))
37459 .add(MI.getOperand(5))
37460 .add(MI.getOperand(6));
37461 }
37462
37463 // Reload the original control word now.
37464 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37465 OrigCWFrameIdx);
37466
37467 MI.eraseFromParent(); // The pseudo instruction is gone now.
37468 return BB;
37469 }
37470
37471 case X86::FP32_TO_INT16_IN_MEM:
37472 case X86::FP32_TO_INT32_IN_MEM:
37473 case X86::FP32_TO_INT64_IN_MEM:
37474 case X86::FP64_TO_INT16_IN_MEM:
37475 case X86::FP64_TO_INT32_IN_MEM:
37476 case X86::FP64_TO_INT64_IN_MEM:
37477 case X86::FP80_TO_INT16_IN_MEM:
37478 case X86::FP80_TO_INT32_IN_MEM:
37479 case X86::FP80_TO_INT64_IN_MEM: {
37480 // Change the floating point control register to use "round towards zero"
37481 // mode when truncating to an integer value.
37482 int OrigCWFrameIdx =
37483 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37484 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37485 OrigCWFrameIdx);
37486
37487 // Load the old value of the control word...
37488 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37489 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37490 OrigCWFrameIdx);
37491
37492    // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
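    // For example, the default control word 0x037F OR 0xC00 gives 0x0F7F,
    // selecting round-toward-zero for the integer stores below.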
37493 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37494 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37495 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37496
37497 // Extract to 16 bits.
37498 Register NewCW16 =
37499 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37500 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37501 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37502
37503 // Prepare memory for FLDCW.
37504 int NewCWFrameIdx =
37505 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37506 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37507 NewCWFrameIdx)
37508 .addReg(NewCW16, RegState::Kill);
37509
37510 // Reload the modified control word now...
37511 addFrameReference(BuildMI(*BB, MI, MIMD,
37512 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37513
37514 // Get the X86 opcode to use.
37515 unsigned Opc;
37516 switch (MI.getOpcode()) {
37517 // clang-format off
37518 default: llvm_unreachable("illegal opcode!");
37519 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37520 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37521 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37522 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37523 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37524 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37525 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37526 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37527 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37528 // clang-format on
37529 }
37530
37531    X86AddressMode AM = getAddressFromInstr(&MI, 0);
37532    addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37533 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37534
37535 // Reload the original control word now.
37536 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37537 OrigCWFrameIdx);
37538
37539 MI.eraseFromParent(); // The pseudo instruction is gone now.
37540 return BB;
37541 }
37542
37543 // xbegin
37544 case X86::XBEGIN:
37545 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37546
37547 case X86::VAARG_64:
37548 case X86::VAARG_X32:
37549 return EmitVAARGWithCustomInserter(MI, BB);
37550
37551 case X86::EH_SjLj_SetJmp32:
37552 case X86::EH_SjLj_SetJmp64:
37553 return emitEHSjLjSetJmp(MI, BB);
37554
37555 case X86::EH_SjLj_LongJmp32:
37556 case X86::EH_SjLj_LongJmp64:
37557 return emitEHSjLjLongJmp(MI, BB);
37558
37559 case X86::Int_eh_sjlj_setup_dispatch:
37560 return EmitSjLjDispatchBlock(MI, BB);
37561
37562 case TargetOpcode::STATEPOINT:
37563 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37564 // this point in the process. We diverge later.
37565 return emitPatchPoint(MI, BB);
37566
37567 case TargetOpcode::STACKMAP:
37568 case TargetOpcode::PATCHPOINT:
37569 return emitPatchPoint(MI, BB);
37570
37571 case TargetOpcode::PATCHABLE_EVENT_CALL:
37572 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37573 return emitPatchableEventCall(MI, BB);
37574
37575 case X86::LCMPXCHG8B: {
37576 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37577    // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
37578    // requires a memory operand. If the current target is i686 and the current
37579    // function needs a base pointer
37580    // - which is ESI on i686 - the register allocator would not be able to
37581    // allocate registers for an address of the form X(%reg, %reg, Y)
37582    // - there would never be enough unreserved registers during regalloc
37583    // (without the need for a base pointer the only option would be X(%edi, %esi, Y)).
37584    // We give the register allocator a hand by precomputing the address in a
37585    // new vreg using LEA.
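    // In other words, instead of asking regalloc to materialize the full
    // X(%reg, %reg, Y) form inside CMPXCHG8B, we emit (sketch):
    //   leal X(%reg, %reg, Y), %vregAddr
    //   lock cmpxchg8b (%vregAddr)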
37586
37587 // If it is not i686 or there is no base pointer - nothing to do here.
37588 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37589 return BB;
37590
37591    // Even though this code does not necessarily need the base pointer to
37592    // be ESI, we check for that. The reason: if this assert fails, something
37593    // has changed in the compiler's base pointer handling, and it most
37594    // probably has to be addressed here as well.
37595 assert(TRI->getBaseRegister() == X86::ESI &&
37596 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
37597 "base pointer in mind");
37598
37599    MachineRegisterInfo &MRI = MF->getRegInfo();
37600    MVT SPTy = getPointerTy(MF->getDataLayout());
37601 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
37602 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
37603
37604    X86AddressMode AM = getAddressFromInstr(&MI, 0);
37605    // Regalloc does not need any help when the memory operand of CMPXCHG8B
37606 // does not use index register.
37607 if (AM.IndexReg == X86::NoRegister)
37608 return BB;
37609
37610 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
37611 // four operand definitions that are E[ABCD] registers. We skip them and
37612 // then insert the LEA.
37613 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
37614 while (RMBBI != BB->rend() &&
37615 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
37616 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
37617 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
37618 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
37619 ++RMBBI;
37620 }
37621    MachineBasicBlock::iterator MBBI(RMBBI);
37622    addFullAddress(
37623        BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
37624
37625 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
37626
37627 return BB;
37628 }
37629 case X86::LCMPXCHG16B_NO_RBX: {
37630 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37631 Register BasePtr = TRI->getBaseRegister();
37632 if (TRI->hasBasePointer(*MF) &&
37633 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
37634 if (!BB->isLiveIn(BasePtr))
37635 BB->addLiveIn(BasePtr);
37636 // Save RBX into a virtual register.
37637 Register SaveRBX =
37638 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37639 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37640 .addReg(X86::RBX);
37641 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37642      MachineInstrBuilder MIB =
37643          BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
37644 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37645 MIB.add(MI.getOperand(Idx));
37646 MIB.add(MI.getOperand(X86::AddrNumOperands));
37647 MIB.addReg(SaveRBX);
37648 } else {
37649 // Simple case, just copy the virtual register to RBX.
37650 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
37651 .add(MI.getOperand(X86::AddrNumOperands));
37652      MachineInstrBuilder MIB =
37653          BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
37654 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37655 MIB.add(MI.getOperand(Idx));
37656 }
37657 MI.eraseFromParent();
37658 return BB;
37659 }
37660 case X86::MWAITX: {
37661 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37662 Register BasePtr = TRI->getBaseRegister();
37663 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
37664    // If there is no need to save the base pointer, we generate MWAITXrrr;
37665    // otherwise we generate the pseudo MWAITX_SAVE_RBX.
37666 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
37667 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37668 .addReg(MI.getOperand(0).getReg());
37669 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37670 .addReg(MI.getOperand(1).getReg());
37671 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
37672 .addReg(MI.getOperand(2).getReg());
37673 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
37674 MI.eraseFromParent();
37675 } else {
37676 if (!BB->isLiveIn(BasePtr)) {
37677 BB->addLiveIn(BasePtr);
37678 }
37679 // Parameters can be copied into ECX and EAX but not EBX yet.
37680 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
37681 .addReg(MI.getOperand(0).getReg());
37682 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
37683 .addReg(MI.getOperand(1).getReg());
37684 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
37685 // Save RBX into a virtual register.
37686 Register SaveRBX =
37687 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37688 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
37689 .addReg(X86::RBX);
37690 // Generate mwaitx pseudo.
37691 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37692 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
37693 .addDef(Dst) // Destination tied in with SaveRBX.
37694 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
37695 .addUse(SaveRBX); // Save of base pointer.
37696 MI.eraseFromParent();
37697 }
37698 return BB;
37699 }
37700 case TargetOpcode::PREALLOCATED_SETUP: {
37701 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
37702 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37703 MFI->setHasPreallocatedCall(true);
37704 int64_t PreallocatedId = MI.getOperand(0).getImm();
37705 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
37706 assert(StackAdjustment != 0 && "0 stack adjustment");
37707 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
37708 << StackAdjustment << "\n");
37709 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
37710 .addReg(X86::ESP)
37711 .addImm(StackAdjustment);
37712 MI.eraseFromParent();
37713 return BB;
37714 }
37715 case TargetOpcode::PREALLOCATED_ARG: {
37716 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
37717 int64_t PreallocatedId = MI.getOperand(1).getImm();
37718 int64_t ArgIdx = MI.getOperand(2).getImm();
37719 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37720 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
37721 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
37722 << ", arg offset " << ArgOffset << "\n");
37723 // stack pointer + offset
37724 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
37725 MI.getOperand(0).getReg()),
37726 X86::ESP, false, ArgOffset);
37727 MI.eraseFromParent();
37728 return BB;
37729 }
37730 case X86::PTDPBSSD:
37731 case X86::PTDPBSUD:
37732 case X86::PTDPBUSD:
37733 case X86::PTDPBUUD:
37734 case X86::PTDPBF16PS:
37735 case X86::PTDPFP16PS:
37736 case X86::PTCMMIMFP16PS:
37737 case X86::PTCMMRLFP16PS:
37738 case X86::PTDPBF8PS:
37739 case X86::PTDPBHF8PS:
37740 case X86::PTDPHBF8PS:
37741 case X86::PTDPHF8PS:
37742 case X86::PTTDPBF16PS:
37743 case X86::PTTDPFP16PS:
37744 case X86::PTTCMMIMFP16PS:
37745 case X86::PTTCMMRLFP16PS:
37746 case X86::PTCONJTCMMIMFP16PS:
37747 case X86::PTMMULTF32PS:
37748 case X86::PTTMMULTF32PS: {
37749 unsigned Opc;
37750 switch (MI.getOpcode()) {
37751 default: llvm_unreachable("illegal opcode!");
37752 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
37753 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
37754 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
37755 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
37756 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
37757 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
37758 case X86::PTCMMIMFP16PS:
37759 Opc = X86::TCMMIMFP16PS;
37760 break;
37761 case X86::PTCMMRLFP16PS:
37762 Opc = X86::TCMMRLFP16PS;
37763 break;
37764 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
37765 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
37766 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
37767 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
37768 case X86::PTTDPBF16PS:
37769 Opc = X86::TTDPBF16PS;
37770 break;
37771 case X86::PTTDPFP16PS:
37772 Opc = X86::TTDPFP16PS;
37773 break;
37774 case X86::PTTCMMIMFP16PS:
37775 Opc = X86::TTCMMIMFP16PS;
37776 break;
37777 case X86::PTTCMMRLFP16PS:
37778 Opc = X86::TTCMMRLFP16PS;
37779 break;
37780 case X86::PTCONJTCMMIMFP16PS:
37781 Opc = X86::TCONJTCMMIMFP16PS;
37782 break;
37783 case X86::PTMMULTF32PS:
37784 Opc = X86::TMMULTF32PS;
37785 break;
37786 case X86::PTTMMULTF32PS:
37787 Opc = X86::TTMMULTF32PS;
37788 break;
37789 }
37790
37791 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37792 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37793 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
37794 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37795 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
37796
37797 MI.eraseFromParent(); // The pseudo is gone now.
37798 return BB;
37799 }
37800 case X86::PTILEZERO: {
37801 unsigned Imm = MI.getOperand(0).getImm();
37802 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
37803 MI.eraseFromParent(); // The pseudo is gone now.
37804    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37805    MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
37806    return BB;
37807 }
37808 case X86::PTILEZEROV: {
37809    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
37810    MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
37811    return BB;
37812 }
37813 case X86::PTILELOADDRS:
37814 case X86::PTILELOADDRST1:
37815 case X86::PTILELOADD:
37816 case X86::PTILELOADDT1:
37817 case X86::PTILESTORED: {
37818 unsigned Opc;
37819 switch (MI.getOpcode()) {
37820 default: llvm_unreachable("illegal opcode!");
37821#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
37822 case X86::PTILELOADD:
37823 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
37824 break;
37825 case X86::PTILELOADDT1:
37826 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
37827 break;
37828 case X86::PTILESTORED:
37829 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
37830 break;
37831 case X86::PTILELOADDRS:
37832 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
37833 break;
37834 case X86::PTILELOADDRST1:
37835 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
37836 break;
37837 }
37838#undef GET_EGPR_IF_ENABLED
37839
37840 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
37841 unsigned CurOp = 0;
37842 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
37843      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37844                 RegState::Define);
37845
37846 MIB.add(MI.getOperand(CurOp++)); // base
37847 MIB.add(MI.getOperand(CurOp++)); // scale
37848 MIB.add(MI.getOperand(CurOp++)); // index -- stride
37849 MIB.add(MI.getOperand(CurOp++)); // displacement
37850 MIB.add(MI.getOperand(CurOp++)); // segment
37851
37852 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
37853      MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37854                 RegState::Undef);
37855
37856 MI.eraseFromParent(); // The pseudo is gone now.
37857 return BB;
37858 }
37859 case X86::PT2RPNTLVWZ0:
37860 case X86::PT2RPNTLVWZ0T1:
37861 case X86::PT2RPNTLVWZ1:
37862 case X86::PT2RPNTLVWZ1T1:
37863 case X86::PT2RPNTLVWZ0RS:
37864 case X86::PT2RPNTLVWZ0RST1:
37865 case X86::PT2RPNTLVWZ1RS:
37866 case X86::PT2RPNTLVWZ1RST1: {
37867 const DebugLoc &DL = MI.getDebugLoc();
37868 unsigned Opc;
37869#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
37870 switch (MI.getOpcode()) {
37871 default:
37872 llvm_unreachable("Unexpected instruction!");
37873 case X86::PT2RPNTLVWZ0:
37874 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
37875 break;
37876 case X86::PT2RPNTLVWZ0T1:
37877 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
37878 break;
37879 case X86::PT2RPNTLVWZ1:
37880 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
37881 break;
37882 case X86::PT2RPNTLVWZ1T1:
37883 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
37884 break;
37885 case X86::PT2RPNTLVWZ0RS:
37886 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
37887 break;
37888 case X86::PT2RPNTLVWZ0RST1:
37889 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
37890 break;
37891 case X86::PT2RPNTLVWZ1RS:
37892 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
37893 break;
37894 case X86::PT2RPNTLVWZ1RST1:
37895 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
37896 break;
37897 }
37898#undef GET_EGPR_IF_ENABLED
37899 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37900 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
37901
37902 MIB.add(MI.getOperand(1)); // base
37903 MIB.add(MI.getOperand(2)); // scale
37904 MIB.add(MI.getOperand(3)); // index
37905 MIB.add(MI.getOperand(4)); // displacement
37906 MIB.add(MI.getOperand(5)); // segment
37907 MI.eraseFromParent(); // The pseudo is gone now.
37908 return BB;
37909 }
37910 case X86::PTTRANSPOSED:
37911 case X86::PTCONJTFP16: {
37912 const DebugLoc &DL = MI.getDebugLoc();
37913 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
37914 : X86::TCONJTFP16;
37915
37916 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37917 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37918 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37919
37920 MI.eraseFromParent(); // The pseudo is gone now.
37921 return BB;
37922 }
37923 case X86::PTCVTROWPS2BF16Hrri:
37924 case X86::PTCVTROWPS2BF16Lrri:
37925 case X86::PTCVTROWPS2PHHrri:
37926 case X86::PTCVTROWPS2PHLrri:
37927 case X86::PTCVTROWD2PSrri:
37928 case X86::PTILEMOVROWrri: {
37929 const DebugLoc &DL = MI.getDebugLoc();
37930 unsigned Opc;
37931 switch (MI.getOpcode()) {
37932 default:
37933 llvm_unreachable("Unexpected instruction!");
37934 case X86::PTCVTROWD2PSrri:
37935 Opc = X86::TCVTROWD2PSrri;
37936 break;
37937 case X86::PTCVTROWPS2BF16Hrri:
37938 Opc = X86::TCVTROWPS2BF16Hrri;
37939 break;
37940 case X86::PTCVTROWPS2PHHrri:
37941 Opc = X86::TCVTROWPS2PHHrri;
37942 break;
37943 case X86::PTCVTROWPS2BF16Lrri:
37944 Opc = X86::TCVTROWPS2BF16Lrri;
37945 break;
37946 case X86::PTCVTROWPS2PHLrri:
37947 Opc = X86::TCVTROWPS2PHLrri;
37948 break;
37949 case X86::PTILEMOVROWrri:
37950 Opc = X86::TILEMOVROWrri;
37951 break;
37952 }
37953 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37954 MIB.add(MI.getOperand(0));
37955 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37956 MIB.addImm(MI.getOperand(2).getImm());
37957
37958 MI.eraseFromParent(); // The pseudo is gone now.
37959 return BB;
37960 }
37961 case X86::PTCVTROWPS2BF16Hrre:
37962 case X86::PTCVTROWPS2BF16Lrre:
37963 case X86::PTCVTROWPS2PHHrre:
37964 case X86::PTCVTROWPS2PHLrre:
37965 case X86::PTCVTROWD2PSrre:
37966 case X86::PTILEMOVROWrre: {
37967 const DebugLoc &DL = MI.getDebugLoc();
37968 unsigned Opc;
37969 switch (MI.getOpcode()) {
37970 default:
37971 llvm_unreachable("Unexpected instruction!");
37972 case X86::PTCVTROWD2PSrre:
37973 Opc = X86::TCVTROWD2PSrre;
37974 break;
37975 case X86::PTCVTROWPS2BF16Hrre:
37976 Opc = X86::TCVTROWPS2BF16Hrre;
37977 break;
37978 case X86::PTCVTROWPS2BF16Lrre:
37979 Opc = X86::TCVTROWPS2BF16Lrre;
37980 break;
37981 case X86::PTCVTROWPS2PHHrre:
37982 Opc = X86::TCVTROWPS2PHHrre;
37983 break;
37984 case X86::PTCVTROWPS2PHLrre:
37985 Opc = X86::TCVTROWPS2PHLrre;
37986 break;
37987 case X86::PTILEMOVROWrre:
37988 Opc = X86::TILEMOVROWrre;
37989 break;
37990 }
37991 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37992 MIB.add(MI.getOperand(0));
37993 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37994 MIB.add(MI.getOperand(2));
37995
37996 MI.eraseFromParent(); // The pseudo is gone now.
37997 return BB;
37998 }
37999 }
38000}
38001
38002//===----------------------------------------------------------------------===//
38003// X86 Optimization Hooks
38004//===----------------------------------------------------------------------===//
38005
38006 bool
38007 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38008 const APInt &DemandedBits,
38009 const APInt &DemandedElts,
38010 TargetLoweringOpt &TLO) const {
38011 EVT VT = Op.getValueType();
38012 unsigned Opcode = Op.getOpcode();
38013 unsigned EltSize = VT.getScalarSizeInBits();
38014
38015 if (VT.isVector()) {
38016 // If the constant is only all signbits in the active bits, then we should
38017 // extend it to the entire constant to allow it to act as a boolean constant
38018 // vector.
38019 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38020 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38021 return false;
38022 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38023 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38024 continue;
38025 const APInt &Val = V.getConstantOperandAPInt(i);
38026 if (Val.getBitWidth() > Val.getNumSignBits() &&
38027 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38028 return true;
38029 }
38030 return false;
38031 };
38032 // For vectors - if we have a constant, then try to sign extend.
38033 // TODO: Handle AND cases.
38034 unsigned ActiveBits = DemandedBits.getActiveBits();
38035 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38036 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38037 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38038 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38039 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38040 VT.getVectorNumElements());
38041 SDValue NewC =
38042 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38043 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38044 SDValue NewOp =
38045 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38046 return TLO.CombineTo(Op, NewOp);
38047 }
38048 return false;
38049 }
38050
38051 // Only optimize Ands to prevent shrinking a constant that could be
38052 // matched by movzx.
38053 if (Opcode != ISD::AND)
38054 return false;
38055
38056 // Make sure the RHS really is a constant.
38057 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38058 if (!C)
38059 return false;
38060
38061 const APInt &Mask = C->getAPIntValue();
38062
38063 // Clear all non-demanded bits initially.
38064 APInt ShrunkMask = Mask & DemandedBits;
38065
38066 // Find the width of the shrunk mask.
38067 unsigned Width = ShrunkMask.getActiveBits();
38068
38069 // If the mask is all 0s there's nothing to do here.
38070 if (Width == 0)
38071 return false;
38072
38073 // Find the next power of 2 width, rounding up to a byte.
38074 Width = llvm::bit_ceil(std::max(Width, 8U));
38075 // Truncate the width to size to handle illegal types.
38076 Width = std::min(Width, EltSize);
38077
38078 // Calculate a possible zero extend mask for this constant.
38079 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
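  // For example, with EltSize == 32, Mask == 0x7F and DemandedBits == 0x3F:
  // ShrunkMask == 0x3F, so Width rounds up to 8 and ZeroExtendMask == 0xFF.
  // 0xFF differs from Mask but only adds non-demanded bits, so the AND
  // constant is widened to 0xFF, which a MOVZX-style pattern can match.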
38080
38081 // If we aren't changing the mask, just return true to keep it and prevent
38082 // the caller from optimizing.
38083 if (ZeroExtendMask == Mask)
38084 return true;
38085
38086 // Make sure the new mask can be represented by a combination of mask bits
38087 // and non-demanded bits.
38088 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38089 return false;
38090
38091 // Replace the constant with the zero extend mask.
38092 SDLoc DL(Op);
38093 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38094 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38095 return TLO.CombineTo(Op, NewOp);
38096}
38097
38098 static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38099 KnownBits &Known,
38100 const APInt &DemandedElts,
38101 const SelectionDAG &DAG, unsigned Depth) {
38102 KnownBits Known2;
38103 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38104 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38105 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38106 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38107 Known = KnownBits::abdu(Known, Known2).zext(16);
38108 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
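  // Each of the three doublings models one level of the pairwise summation
  // above; the final sum of eight i8 absolute differences is at most
  // 8 * 255 == 2040, so the high bits stay known zero.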
38109 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38110 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38111 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38112 Known = Known.zext(64);
38113}
38114
38115 static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38116 KnownBits &Known,
38117 const APInt &DemandedElts,
38118 const SelectionDAG &DAG,
38119 unsigned Depth) {
38120 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38121
38122 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38123 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38124 APInt DemandedLoElts =
38125 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38126 APInt DemandedHiElts =
38127 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
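  // Even-numbered source elements form the Lo product of each pair and
  // odd-numbered elements the Hi product.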
38128 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38129 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38130 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38131 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38132 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38133 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38134 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38135}
38136
38137 static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38138 KnownBits &Known,
38139 const APInt &DemandedElts,
38140 const SelectionDAG &DAG,
38141 unsigned Depth) {
38142 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38143
38144 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38145 // pairs.
38146 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38147 APInt DemandedLoElts =
38148 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38149 APInt DemandedHiElts =
38150 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38151 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38152 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38153 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38154 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38155 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38156 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38157 Known = KnownBits::sadd_sat(Lo, Hi);
38158}
38159
38160 static KnownBits computeKnownBitsForHorizontalOperation(
38161 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38162 const SelectionDAG &DAG,
38163 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38164 KnownBitsFunc) {
38165 APInt DemandedEltsLHS, DemandedEltsRHS;
38166 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38167 DemandedElts, DemandedEltsLHS,
38168 DemandedEltsRHS);
38169
38170 const auto ComputeForSingleOpFunc =
38171 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38172 return KnownBitsFunc(
38173 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38174 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38175 };
38176
38177 if (DemandedEltsRHS.isZero())
38178 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38179 if (DemandedEltsLHS.isZero())
38180 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38181
38182 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38183 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38184}
38185
38186 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38187 KnownBits &Known,
38188 const APInt &DemandedElts,
38189 const SelectionDAG &DAG,
38190 unsigned Depth) const {
38191 unsigned BitWidth = Known.getBitWidth();
38192 unsigned NumElts = DemandedElts.getBitWidth();
38193 unsigned Opc = Op.getOpcode();
38194 EVT VT = Op.getValueType();
38195 assert((Opc >= ISD::BUILTIN_OP_END ||
38196 Opc == ISD::INTRINSIC_WO_CHAIN ||
38197 Opc == ISD::INTRINSIC_W_CHAIN ||
38198 Opc == ISD::INTRINSIC_VOID) &&
38199 "Should use MaskedValueIsZero if you don't know whether Op"
38200 " is a target node!");
38201
38202 Known.resetAll();
38203 switch (Opc) {
38204 default: break;
38205 case X86ISD::MUL_IMM: {
38206 KnownBits Known2;
38207 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38208 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38209 Known = KnownBits::mul(Known, Known2);
38210 break;
38211 }
38212 case X86ISD::BSF: {
38213 Known.Zero.setBitsFrom(Log2_32(BitWidth));
38214
38215 KnownBits Known2;
38216 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38217 if (Known2.isNonZero()) {
38218 // If we have a known 1, its position is our upper bound.
38219 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38220 unsigned LowBits = llvm::bit_width(PossibleTZ);
38221 Known.Zero.setBitsFrom(LowBits);
38222 } else if (!Op.getOperand(0).isUndef()) {
38223 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38224 Known = Known.intersectWith(Known2);
38225 }
38226 break;
38227 }
38228 case X86ISD::BSR: {
38229 // TODO: Bound with input known bits?
38230 Known.Zero.setBitsFrom(Log2_32(BitWidth));
38231
38232 if (!Op.getOperand(0).isUndef() &&
38233 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38234 KnownBits Known2;
38235 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38236 Known = Known.intersectWith(Known2);
38237 }
38238 break;
38239 }
38240 case X86ISD::SETCC:
38241 Known.Zero.setBitsFrom(1);
38242 break;
38243 case X86ISD::MOVMSK: {
38244 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38245 Known.Zero.setBitsFrom(NumLoBits);
38246 break;
38247 }
38248 case X86ISD::PEXTRB:
38249 case X86ISD::PEXTRW: {
38250 SDValue Src = Op.getOperand(0);
38251 EVT SrcVT = Src.getValueType();
38252 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38253 Op.getConstantOperandVal(1));
38254 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38255 Known = Known.anyextOrTrunc(BitWidth);
38256 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38257 break;
38258 }
38259 case X86ISD::VSRAI:
38260 case X86ISD::VSHLI:
38261 case X86ISD::VSRLI: {
38262 unsigned ShAmt = Op.getConstantOperandVal(1);
38263 if (ShAmt >= VT.getScalarSizeInBits()) {
38264 // Out of range logical bit shifts are guaranteed to be zero.
38265 // Out of range arithmetic bit shifts splat the sign bit.
38266 if (Opc != X86ISD::VSRAI) {
38267 Known.setAllZero();
38268 break;
38269 }
38270
38271 ShAmt = VT.getScalarSizeInBits() - 1;
38272 }
38273
38274 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38275 if (Opc == X86ISD::VSHLI) {
38276 Known.Zero <<= ShAmt;
38277 Known.One <<= ShAmt;
38278 // Low bits are known zero.
38279 Known.Zero.setLowBits(ShAmt);
38280 } else if (Opc == X86ISD::VSRLI) {
38281 Known.Zero.lshrInPlace(ShAmt);
38282 Known.One.lshrInPlace(ShAmt);
38283 // High bits are known zero.
38284 Known.Zero.setHighBits(ShAmt);
38285 } else {
38286 Known.Zero.ashrInPlace(ShAmt);
38287 Known.One.ashrInPlace(ShAmt);
38288 }
38289 break;
38290 }
38291 case X86ISD::PACKUS: {
38292 // PACKUS is just a truncation if the upper half is zero.
38293 APInt DemandedLHS, DemandedRHS;
38294 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38295
38296 Known.One = APInt::getAllOnes(BitWidth * 2);
38297 Known.Zero = APInt::getAllOnes(BitWidth * 2);
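  // Start from an all-known state at the source element width (2 * BitWidth)
  // and intersect with both inputs; the pack is only a plain truncation if
  // the upper half of every demanded source element is known zero.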
38298
38299 KnownBits Known2;
38300 if (!!DemandedLHS) {
38301 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38302 Known = Known.intersectWith(Known2);
38303 }
38304 if (!!DemandedRHS) {
38305 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38306 Known = Known.intersectWith(Known2);
38307 }
38308
38309 if (Known.countMinLeadingZeros() < BitWidth)
38310 Known.resetAll();
38311 Known = Known.trunc(BitWidth);
38312 break;
38313 }
38314 case X86ISD::PSHUFB: {
38315 SDValue Src = Op.getOperand(0);
38316 SDValue Idx = Op.getOperand(1);
38317
38318 // If the index vector is never negative (MSB is zero), then all elements
38319 // come from the source vector. This is useful for cases where
38320 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38321 // below will handle the more common constant shuffle mask case.
38322 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38323 if (KnownIdx.isNonNegative())
38324 Known = DAG.computeKnownBits(Src, Depth + 1);
38325 break;
38326 }
38327 case X86ISD::VBROADCAST: {
38328 SDValue Src = Op.getOperand(0);
38329 if (!Src.getSimpleValueType().isVector()) {
38330 Known = DAG.computeKnownBits(Src, Depth + 1);
38331 return;
38332 }
38333 break;
38334 }
38335 case X86ISD::AND: {
38336 if (Op.getResNo() == 0) {
38337 KnownBits Known2;
38338 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38339 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38340 Known &= Known2;
38341 }
38342 break;
38343 }
38344 case X86ISD::ANDNP: {
38345 KnownBits Known2;
38346 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38347 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38348
38349 // ANDNP = (~X & Y);
38350 Known.One &= Known2.Zero;
38351 Known.Zero |= Known2.One;
38352 break;
38353 }
38354 case X86ISD::FOR: {
38355 KnownBits Known2;
38356 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38357 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38358
38359 Known |= Known2;
38360 break;
38361 }
38362 case X86ISD::PSADBW: {
38363 SDValue LHS = Op.getOperand(0);
38364 SDValue RHS = Op.getOperand(1);
38365 assert(VT.getScalarType() == MVT::i64 &&
38366 LHS.getValueType() == RHS.getValueType() &&
38367 LHS.getValueType().getScalarType() == MVT::i8 &&
38368 "Unexpected PSADBW types");
38369 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38370 break;
38371 }
38372 case X86ISD::PCMPGT:
38373 case X86ISD::PCMPEQ: {
38374 KnownBits KnownLhs =
38375 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38376 KnownBits KnownRhs =
38377 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38378 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38379 ? KnownBits::eq(KnownLhs, KnownRhs)
38380 : KnownBits::sgt(KnownLhs, KnownRhs);
38381 if (Res) {
38382 if (*Res)
38383 Known.setAllOnes();
38384 else
38385 Known.setAllZero();
38386 }
38387 break;
38388 }
38389 case X86ISD::VPMADDWD: {
38390 SDValue LHS = Op.getOperand(0);
38391 SDValue RHS = Op.getOperand(1);
38392 assert(VT.getVectorElementType() == MVT::i32 &&
38393 LHS.getValueType() == RHS.getValueType() &&
38394 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38395 "Unexpected PMADDWD types");
38396 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38397 break;
38398 }
38399 case X86ISD::VPMADDUBSW: {
38400 SDValue LHS = Op.getOperand(0);
38401 SDValue RHS = Op.getOperand(1);
38402 assert(VT.getVectorElementType() == MVT::i16 &&
38403 LHS.getValueType() == RHS.getValueType() &&
38404 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38405 "Unexpected PMADDUBSW types");
38406 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38407 break;
38408 }
38409 case X86ISD::PMULUDQ: {
38410 KnownBits Known2;
38411 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38412 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38413
38414 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38415 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38416 Known = KnownBits::mul(Known, Known2);
38417 break;
38418 }
38419 case X86ISD::CMOV: {
38420 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38421 // If we don't know any bits, early out.
38422 if (Known.isUnknown())
38423 break;
38424 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38425
38426 // Only known if known in both the LHS and RHS.
38427 Known = Known.intersectWith(Known2);
38428 break;
38429 }
38430 case X86ISD::BEXTR:
38431 case X86ISD::BEXTRI: {
38432 SDValue Op0 = Op.getOperand(0);
38433 SDValue Op1 = Op.getOperand(1);
38434
38435 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38436 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38437 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
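  // The BEXTR control encodes the start bit in bits [7:0] and the field
  // length in bits [15:8]; e.g. a control of 0x0404 extracts 4 bits
  // starting at bit 4.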
38438
38439 // If the length is 0, the result is 0.
38440 if (Length == 0) {
38441 Known.setAllZero();
38442 break;
38443 }
38444
38445 if ((Shift + Length) <= BitWidth) {
38446 Known = DAG.computeKnownBits(Op0, Depth + 1);
38447 Known = Known.extractBits(Length, Shift);
38448 Known = Known.zextOrTrunc(BitWidth);
38449 }
38450 }
38451 break;
38452 }
38453 case X86ISD::PDEP: {
38454 KnownBits Known2;
38455 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38456 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38457 // Zeros are retained from the mask operand. But not ones.
38458 Known.One.clearAllBits();
38459 // The result will have at least as many trailing zeros as the non-mask
38460 // operand since bits can only map to the same or higher bit position.
38461 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38462 break;
38463 }
38464 case X86ISD::PEXT: {
38465 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38466 // The result has as many leading zeros as the number of zeroes in the mask.
38467 unsigned Count = Known.Zero.popcount();
38468 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38469 Known.One.clearAllBits();
38470 break;
38471 }
38472 case X86ISD::VTRUNC:
38473 case X86ISD::VTRUNCS:
38474 case X86ISD::VTRUNCUS:
38475 case X86ISD::CVTSI2P:
38476 case X86ISD::CVTUI2P:
38477 case X86ISD::CVTP2SI:
38478 case X86ISD::CVTP2UI:
38479 case X86ISD::MCVTP2SI:
38480 case X86ISD::MCVTP2UI:
38481 case X86ISD::CVTTP2SI:
38482 case X86ISD::CVTTP2UI:
38483 case X86ISD::MCVTTP2SI:
38484 case X86ISD::MCVTTP2UI:
38485 case X86ISD::MCVTSI2P:
38486 case X86ISD::MCVTUI2P:
38487 case X86ISD::VFPROUND:
38488 case X86ISD::VMFPROUND:
38489 case X86ISD::CVTPS2PH:
38490 case X86ISD::MCVTPS2PH:
38491 case X86ISD::MCVTTP2SIS:
38492 case X86ISD::MCVTTP2UIS: {
38493 // Truncations/Conversions - upper elements are known zero.
38494 EVT SrcVT = Op.getOperand(0).getValueType();
38495 if (SrcVT.isVector()) {
38496 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38497 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38498 Known.setAllZero();
38499 }
38500 break;
38501 }
38502 case X86ISD::STRICT_CVTTP2SI:
38503 case X86ISD::STRICT_CVTTP2UI:
38504 case X86ISD::STRICT_CVTSI2P:
38505 case X86ISD::STRICT_CVTUI2P:
38506 case X86ISD::STRICT_VFPROUND:
38507 case X86ISD::STRICT_CVTPS2PH: {
38508 // Strict Conversions - upper elements are known zero.
38509 EVT SrcVT = Op.getOperand(1).getValueType();
38510 if (SrcVT.isVector()) {
38511 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38512 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38513 Known.setAllZero();
38514 }
38515 break;
38516 }
38517 case X86ISD::MOVQ2DQ: {
38518 // Move from MMX to XMM. Upper half of XMM should be 0.
38519 if (DemandedElts.countr_zero() >= (NumElts / 2))
38520 Known.setAllZero();
38521 break;
38522 }
38523 case X86ISD::VBROADCAST_LOAD: {
38524 APInt UndefElts;
38525 SmallVector<APInt, 16> EltBits;
38526 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38527 /*AllowWholeUndefs*/ false,
38528 /*AllowPartialUndefs*/ false)) {
38529 Known.Zero.setAllBits();
38530 Known.One.setAllBits();
38531 for (unsigned I = 0; I != NumElts; ++I) {
38532 if (!DemandedElts[I])
38533 continue;
38534 if (UndefElts[I]) {
38535 Known.resetAll();
38536 break;
38537 }
38538 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38539 Known = Known.intersectWith(Known2);
38540 }
38541 return;
38542 }
38543 break;
38544 }
38545 case X86ISD::HADD:
38546 case X86ISD::HSUB: {
38547 Known = computeKnownBitsForHorizontalOperation(
38548 Op, DemandedElts, Depth, DAG,
38549 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38550 return KnownBits::computeForAddSub(
38551 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38552 KnownLHS, KnownRHS);
38553 });
38554 break;
38555 }
38556 case ISD::INTRINSIC_WO_CHAIN: {
38557 switch (Op->getConstantOperandVal(0)) {
38558 case Intrinsic::x86_sse2_pmadd_wd:
38559 case Intrinsic::x86_avx2_pmadd_wd:
38560 case Intrinsic::x86_avx512_pmaddw_d_512: {
38561 SDValue LHS = Op.getOperand(1);
38562 SDValue RHS = Op.getOperand(2);
38563 assert(VT.getScalarType() == MVT::i32 &&
38564 LHS.getValueType() == RHS.getValueType() &&
38565 LHS.getValueType().getScalarType() == MVT::i16 &&
38566 "Unexpected PMADDWD types");
38567 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38568 break;
38569 }
38570 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38571 case Intrinsic::x86_avx2_pmadd_ub_sw:
38572 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38573 SDValue LHS = Op.getOperand(1);
38574 SDValue RHS = Op.getOperand(2);
38575 assert(VT.getScalarType() == MVT::i16 &&
38576 LHS.getValueType() == RHS.getValueType() &&
38577 LHS.getValueType().getScalarType() == MVT::i8 &&
38578 "Unexpected PMADDUBSW types");
38579 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38580 break;
38581 }
38582 case Intrinsic::x86_sse2_psad_bw:
38583 case Intrinsic::x86_avx2_psad_bw:
38584 case Intrinsic::x86_avx512_psad_bw_512: {
38585 SDValue LHS = Op.getOperand(1);
38586 SDValue RHS = Op.getOperand(2);
38587 assert(VT.getScalarType() == MVT::i64 &&
38588 LHS.getValueType() == RHS.getValueType() &&
38589 LHS.getValueType().getScalarType() == MVT::i8 &&
38590 "Unexpected PSADBW types");
38591 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38592 break;
38593 }
38594 }
38595 break;
38596 }
38597 }
38598
38599 // Handle target shuffles.
38600 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38601 if (isTargetShuffle(Opc)) {
38602 SmallVector<int, 64> Mask;
38603 SmallVector<SDValue, 2> Ops;
38604 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
38605 unsigned NumOps = Ops.size();
38606 unsigned NumElts = VT.getVectorNumElements();
38607 if (Mask.size() == NumElts) {
38608 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38609 Known.Zero.setAllBits(); Known.One.setAllBits();
38610 for (unsigned i = 0; i != NumElts; ++i) {
38611 if (!DemandedElts[i])
38612 continue;
38613 int M = Mask[i];
38614 if (M == SM_SentinelUndef) {
38615 // For UNDEF elements, we don't know anything about the common state
38616 // of the shuffle result.
38617 Known.resetAll();
38618 break;
38619 }
38620 if (M == SM_SentinelZero) {
38621 Known.One.clearAllBits();
38622 continue;
38623 }
38624 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38625 "Shuffle index out of range");
38626
38627 unsigned OpIdx = (unsigned)M / NumElts;
38628 unsigned EltIdx = (unsigned)M % NumElts;
38629 if (Ops[OpIdx].getValueType() != VT) {
38630 // TODO - handle target shuffle ops with different value types.
38631 Known.resetAll();
38632 break;
38633 }
38634 DemandedOps[OpIdx].setBit(EltIdx);
38635 }
38636 // Known bits are the values that are shared by every demanded element.
38637 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38638 if (!DemandedOps[i])
38639 continue;
38640 KnownBits Known2 =
38641 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38642 Known = Known.intersectWith(Known2);
38643 }
38644 }
38645 }
38646 }
38647}
38648
38649 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38650 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38651 unsigned Depth) const {
38652 EVT VT = Op.getValueType();
38653 unsigned VTBits = VT.getScalarSizeInBits();
38654 unsigned Opcode = Op.getOpcode();
38655 switch (Opcode) {
38656 case X86ISD::SETCC_CARRY:
38657 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38658 return VTBits;
38659
38660 case X86ISD::VTRUNC: {
38661 SDValue Src = Op.getOperand(0);
38662 MVT SrcVT = Src.getSimpleValueType();
38663 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38664 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38665 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38666 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38667 if (Tmp > (NumSrcBits - VTBits))
38668 return Tmp - (NumSrcBits - VTBits);
38669 return 1;
38670 }
38671
38672 case X86ISD::PACKSS: {
38673 // PACKSS is just a truncation if the sign bits extend to the packed size.
38674 APInt DemandedLHS, DemandedRHS;
38675 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38676 DemandedRHS);
38677
38678 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
38679 // patterns often used to compact vXi64 allsignbit patterns.
38680 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
38681 SDValue BC = peekThroughBitcasts(V);
38682 if (BC.getOpcode() == X86ISD::PACKSS &&
38683 BC.getScalarValueSizeInBits() == 16 &&
38684 V.getScalarValueSizeInBits() == 32) {
38685 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
38686 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
38687 if (BC0.getScalarValueSizeInBits() == 64 &&
38688 BC1.getScalarValueSizeInBits() == 64 &&
38689 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
38690 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
38691 return 32;
38692 }
38693 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
38694 };
38695
38696 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38697 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38698 if (!!DemandedLHS)
38699 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
38700 if (!!DemandedRHS)
38701 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
38702 unsigned Tmp = std::min(Tmp0, Tmp1);
38703 if (Tmp > (SrcBits - VTBits))
38704 return Tmp - (SrcBits - VTBits);
38705 return 1;
38706 }
38707
38708 case X86ISD::VBROADCAST: {
38709 SDValue Src = Op.getOperand(0);
38710 if (!Src.getSimpleValueType().isVector())
38711 return DAG.ComputeNumSignBits(Src, Depth + 1);
38712 break;
38713 }
38714
38715 case X86ISD::VSHLI: {
38716 SDValue Src = Op.getOperand(0);
38717 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38718 if (ShiftVal.uge(VTBits))
38719 return VTBits; // Shifted all bits out --> zero.
38720 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38721 if (ShiftVal.uge(Tmp))
38722 return 1; // Shifted all sign bits out --> unknown.
38723 return Tmp - ShiftVal.getZExtValue();
38724 }
38725
38726 case X86ISD::VSRAI: {
38727 SDValue Src = Op.getOperand(0);
38728 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38729 if (ShiftVal.uge(VTBits - 1))
38730 return VTBits; // Sign splat.
38731 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38732 ShiftVal += Tmp;
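  // Each bit shifted in is a copy of the sign bit, so an arithmetic shift by
  // N grows the sign-bit count by N (capped at the element width below).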
38733 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38734 }
38735
38736 case X86ISD::FSETCC:
38737 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38738 if (VT == MVT::f32 || VT == MVT::f64 ||
38739 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38740 return VTBits;
38741 break;
38742
38743 case X86ISD::PCMPGT:
38744 case X86ISD::PCMPEQ:
38745 case X86ISD::CMPP:
38746 case X86ISD::VPCOM:
38747 case X86ISD::VPCOMU:
38748 // Vector compares return zero/all-bits result values.
38749 return VTBits;
38750
38751 case X86ISD::ANDNP: {
38752 unsigned Tmp0 =
38753 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38754 if (Tmp0 == 1) return 1; // Early out.
38755 unsigned Tmp1 =
38756 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38757 return std::min(Tmp0, Tmp1);
38758 }
38759
38760 case X86ISD::CMOV: {
38761 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38762 if (Tmp0 == 1) return 1; // Early out.
38763 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38764 return std::min(Tmp0, Tmp1);
38765 }
38766 }
38767
38768 // Handle target shuffles.
38769 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38770 if (isTargetShuffle(Opcode)) {
38771 SmallVector<int, 64> Mask;
38772 SmallVector<SDValue, 2> Ops;
38773 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
38774 unsigned NumOps = Ops.size();
38775 unsigned NumElts = VT.getVectorNumElements();
38776 if (Mask.size() == NumElts) {
38777 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38778 for (unsigned i = 0; i != NumElts; ++i) {
38779 if (!DemandedElts[i])
38780 continue;
38781 int M = Mask[i];
38782 if (M == SM_SentinelUndef) {
38783 // For UNDEF elements, we don't know anything about the common state
38784 // of the shuffle result.
38785 return 1;
38786 } else if (M == SM_SentinelZero) {
38787 // Zero = all sign bits.
38788 continue;
38789 }
38790 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38791 "Shuffle index out of range");
38792
38793 unsigned OpIdx = (unsigned)M / NumElts;
38794 unsigned EltIdx = (unsigned)M % NumElts;
38795 if (Ops[OpIdx].getValueType() != VT) {
38796 // TODO - handle target shuffle ops with different value types.
38797 return 1;
38798 }
38799 DemandedOps[OpIdx].setBit(EltIdx);
38800 }
38801 unsigned Tmp0 = VTBits;
38802 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38803 if (!DemandedOps[i])
38804 continue;
38805 unsigned Tmp1 =
38806 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38807 Tmp0 = std::min(Tmp0, Tmp1);
38808 }
38809 return Tmp0;
38810 }
38811 }
38812 }
38813
38814 // Fallback case.
38815 return 1;
38816}
38817
38818 static SDValue unwrapAddress(SDValue N) {
38819 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38820 return N->getOperand(0);
38821 return N;
38822}
38823
38824// Helper to look for a normal load that can be narrowed into a vzload with the
38825// specified VT and memory VT. Returns SDValue() on failure.
38826 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
38827 SelectionDAG &DAG) {
38828 // Can't if the load is volatile or atomic.
38829 if (!LN->isSimple())
38830 return SDValue();
38831
38832 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38833 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38834 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38835 LN->getPointerInfo(), LN->getOriginalAlign(),
38836 LN->getMemOperand()->getFlags());
38837}
38838
38839// Attempt to match a combined shuffle mask against supported unary shuffle
38840// instructions.
38841// TODO: Investigate sharing more of this with shuffle lowering.
38842static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38843 bool AllowFloatDomain, bool AllowIntDomain,
38844 SDValue V1, const SelectionDAG &DAG,
38845 const X86Subtarget &Subtarget, unsigned &Shuffle,
38846 MVT &SrcVT, MVT &DstVT) {
38847 unsigned NumMaskElts = Mask.size();
38848 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38849
38850 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38851 if (Mask[0] == 0 &&
38852 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
38853 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38854 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38855 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38856 Shuffle = X86ISD::VZEXT_MOVL;
38857 if (MaskEltSize == 16)
38858 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38859 else
38860 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38861 return true;
38862 }
38863 }
38864
38865 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
38866 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38867 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38868 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38869 unsigned MaxScale = 64 / MaskEltSize;
38870 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
38871 DAG.ComputeNumSignBits(V1) == MaskEltSize;
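    // e.g. a v4i32 mask {0, Z, 1, Z} (Z = zero) matches at Scale == 2 and
    // becomes a ZERO_EXTEND_VECTOR_INREG from v4i32 to v2i64.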
38872 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38873 bool MatchAny = true;
38874 bool MatchZero = true;
38875 bool MatchSign = UseSign;
38876 unsigned NumDstElts = NumMaskElts / Scale;
38877 for (unsigned i = 0;
38878 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
38879 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38880 MatchAny = MatchSign = MatchZero = false;
38881 break;
38882 }
38883 unsigned Pos = (i * Scale) + 1;
38884 unsigned Len = Scale - 1;
38885 MatchAny &= isUndefInRange(Mask, Pos, Len);
38886 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
38887 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
38888 }
38889 if (MatchAny || MatchSign || MatchZero) {
38890 assert((MatchSign || MatchZero) &&
38891 "Failed to match sext/zext but matched aext?");
38892 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38893 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
38894 : MVT::getIntegerVT(MaskEltSize);
38895 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38896
38897 Shuffle = unsigned(
38898 MatchAny ? ISD::ANY_EXTEND
38899 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
38900 if (SrcVT.getVectorNumElements() != NumDstElts)
38901 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38902
38903 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38904 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38905 return true;
38906 }
38907 }
38908 }
38909
38910 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
38911 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38912 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38913 isUndefOrEqual(Mask[0], 0) &&
38914 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38915 Shuffle = X86ISD::VZEXT_MOVL;
38916 if (MaskEltSize == 16)
38917 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38918 else
38919 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38920 return true;
38921 }
38922
38923 // Check if we have SSE3 which will let us use MOVDDUP etc. The
38924 // instructions are no slower than UNPCKLPD but have the option to
38925 // fold the input operand into even an unaligned memory load.
38926 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
38927 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
38928 Shuffle = X86ISD::MOVDDUP;
38929 SrcVT = DstVT = MVT::v2f64;
38930 return true;
38931 }
38932 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38933 Shuffle = X86ISD::MOVSLDUP;
38934 SrcVT = DstVT = MVT::v4f32;
38935 return true;
38936 }
38937 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
38938 Shuffle = X86ISD::MOVSHDUP;
38939 SrcVT = DstVT = MVT::v4f32;
38940 return true;
38941 }
38942 }
38943
38944 if (MaskVT.is256BitVector() && AllowFloatDomain) {
38945 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38946 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38947 Shuffle = X86ISD::MOVDDUP;
38948 SrcVT = DstVT = MVT::v4f64;
38949 return true;
38950 }
38951 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38952 V1)) {
38953 Shuffle = X86ISD::MOVSLDUP;
38954 SrcVT = DstVT = MVT::v8f32;
38955 return true;
38956 }
38957 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
38958 V1)) {
38959 Shuffle = X86ISD::MOVSHDUP;
38960 SrcVT = DstVT = MVT::v8f32;
38961 return true;
38962 }
38963 }
38964
38965 if (MaskVT.is512BitVector() && AllowFloatDomain) {
38966 assert(Subtarget.hasAVX512() &&
38967 "AVX512 required for 512-bit vector shuffles");
38968 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38969 V1)) {
38970 Shuffle = X86ISD::MOVDDUP;
38971 SrcVT = DstVT = MVT::v8f64;
38972 return true;
38973 }
38974 if (isTargetShuffleEquivalent(
38975 MaskVT, Mask,
38976 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
38977 Shuffle = X86ISD::MOVSLDUP;
38978 SrcVT = DstVT = MVT::v16f32;
38979 return true;
38980 }
38981 if (isTargetShuffleEquivalent(
38982 MaskVT, Mask,
38983 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
38984 Shuffle = X86ISD::MOVSHDUP;
38985 SrcVT = DstVT = MVT::v16f32;
38986 return true;
38987 }
38988 }
38989
38990 return false;
38991}
38992
38993// Attempt to match a combined shuffle mask against supported unary immediate
38994// permute instructions.
38995 // TODO: Investigate sharing more of this with shuffle lowering.
38996 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
38997 const APInt &Zeroable,
38998 bool AllowFloatDomain, bool AllowIntDomain,
38999 const SelectionDAG &DAG,
39000 const X86Subtarget &Subtarget,
39001 unsigned &Shuffle, MVT &ShuffleVT,
39002 unsigned &PermuteImm) {
39003 unsigned NumMaskElts = Mask.size();
39004 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39005 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39006 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39007 bool ContainsZeros = isAnyZero(Mask);
39008
39009 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39010 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39011 // Check for lane crossing permutes.
39012 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39013 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39014 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39015 Shuffle = X86ISD::VPERMI;
39016 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39017 PermuteImm = getV4X86ShuffleImm(Mask);
39018 return true;
39019 }
39020 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39021 SmallVector<int, 4> RepeatedMask;
39022 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39023 Shuffle = X86ISD::VPERMI;
39024 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39025 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39026 return true;
39027 }
39028 }
39029 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39030 // VPERMILPD can permute with a non-repeating shuffle.
39031 Shuffle = X86ISD::VPERMILPI;
39032 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39033 PermuteImm = 0;
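      // Each mask element stays within its own 128-bit lane (asserted below)
      // and contributes its low bit to the immediate, so e.g. the v4f64 mask
      // {1, 0, 3, 2} produces PermuteImm == 0b0101.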
39034 for (int i = 0, e = Mask.size(); i != e; ++i) {
39035 int M = Mask[i];
39036 if (M == SM_SentinelUndef)
39037 continue;
39038 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39039 PermuteImm |= (M & 1) << i;
39040 }
39041 return true;
39042 }
39043 }
39044
39045 // We are checking for shuffle match or shift match. Loop twice so we can
39046 // order which we try and match first depending on target preference.
39047 for (unsigned Order = 0; Order < 2; ++Order) {
39048 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39049 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39050 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
39051 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39052 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39053 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39054 SmallVector<int, 4> RepeatedMask;
39055 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39056 // Narrow the repeated mask to create 32-bit element permutes.
39057 SmallVector<int, 4> WordMask = RepeatedMask;
39058 if (MaskScalarSizeInBits == 64)
39059 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39060
39061 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39062 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39063 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39064 PermuteImm = getV4X86ShuffleImm(WordMask);
39065 return true;
39066 }
39067 }
39068
39069 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39070 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39071 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39072 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39073 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39074 SmallVector<int, 4> RepeatedMask;
39075 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39076 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39077 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39078
39079 // PSHUFLW: permute lower 4 elements only.
39080 if (isUndefOrInRange(LoMask, 0, 4) &&
39081 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39082 Shuffle = X86ISD::PSHUFLW;
39083 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39084 PermuteImm = getV4X86ShuffleImm(LoMask);
39085 return true;
39086 }
39087
39088 // PSHUFHW: permute upper 4 elements only.
39089 if (isUndefOrInRange(HiMask, 4, 8) &&
39090 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39091 // Offset the HiMask so that we can create the shuffle immediate.
39092 int OffsetHiMask[4];
39093 for (int i = 0; i != 4; ++i)
39094 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
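      // e.g. a HiMask of {5, 4, 7, 6} becomes {1, 0, 3, 2} before being
      // encoded into the PSHUFHW immediate.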
39095
39096 Shuffle = X86ISD::PSHUFHW;
39097 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39098 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39099 return true;
39100 }
39101 }
39102 }
39103 } else {
39104 // Attempt to match against bit rotates.
39105 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39106 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39107 Subtarget.hasAVX512())) {
39108 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39109 Subtarget, Mask);
39110 if (0 < RotateAmt) {
39111 Shuffle = X86ISD::VROTLI;
39112 PermuteImm = (unsigned)RotateAmt;
39113 return true;
39114 }
39115 }
39116 }
39117 // Attempt to match against byte/bit shifts.
39118 if (AllowIntDomain &&
39119 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39120 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39121 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39122 int ShiftAmt =
39123 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39124 Zeroable, Subtarget);
39125 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39126 32 <= ShuffleVT.getScalarSizeInBits())) {
39127 // Byte shifts can be slower so only match them on second attempt.
39128 if (Order == 0 &&
39129 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39130 continue;
39131
39132 PermuteImm = (unsigned)ShiftAmt;
39133 return true;
39134 }
39135
39136 }
39137 }
39138
39139 return false;
39140}
39141
39142// Attempt to match a combined unary shuffle mask against supported binary
39143// shuffle instructions.
39144// TODO: Investigate sharing more of this with shuffle lowering.
39145static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39146 bool AllowFloatDomain, bool AllowIntDomain,
39147 SDValue &V1, SDValue &V2, const SDLoc &DL,
39148 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39149 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39150 bool IsUnary) {
39151 unsigned NumMaskElts = Mask.size();
39152 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39153 unsigned SizeInBits = MaskVT.getSizeInBits();
39154
39155 if (MaskVT.is128BitVector()) {
39156 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39157 AllowFloatDomain) {
39158 V2 = V1;
39159 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39160 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39161 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39162 return true;
39163 }
39164 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39165 AllowFloatDomain) {
39166 V2 = V1;
39167 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39168 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39169 return true;
39170 }
39171 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39172 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39173 std::swap(V1, V2);
39174 Shuffle = X86ISD::MOVSD;
39175 SrcVT = DstVT = MVT::v2f64;
39176 return true;
39177 }
39178 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39179 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39180 Shuffle = X86ISD::MOVSS;
39181 SrcVT = DstVT = MVT::v4f32;
39182 return true;
39183 }
39184 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39185 DAG) &&
39186 Subtarget.hasFP16()) {
39187 Shuffle = X86ISD::MOVSH;
39188 SrcVT = DstVT = MVT::v8f16;
39189 return true;
39190 }
39191 }
39192
39193 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
39194 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39195 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39196 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39197 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39198 Subtarget)) {
39199 DstVT = MaskVT;
39200 return true;
39201 }
39202 }
39203 // TODO: Can we handle this inside matchShuffleWithPACK?
39204 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39205 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39206 V1.getScalarValueSizeInBits() == 64 &&
39207 V2.getScalarValueSizeInBits() == 64) {
39208 // Use (SSE41) PACKUSDW if the leading zero bits go to the lowest 16-bits.
39209 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39210 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39211 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39212 SrcVT = MVT::v4i32;
39213 DstVT = MVT::v8i16;
39214 Shuffle = X86ISD::PACKUS;
39215 return true;
39216 }
39217 // Use PACKUSWB if the leading zero bits go to the lowest 8-bits.
39218 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39219 SrcVT = MVT::v8i16;
39220 DstVT = MVT::v16i8;
39221 Shuffle = X86ISD::PACKUS;
39222 return true;
39223 }
39224 // Use PACKSSDW if the sign bits extend to the lowest 16-bits.
39225 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39226 SrcVT = MVT::v4i32;
39227 DstVT = MVT::v8i16;
39228 Shuffle = X86ISD::PACKSS;
39229 return true;
39230 }
39231 }
39232
39233 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39234 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39235 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39236 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39237 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39238 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39239 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39240 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39241 Subtarget)) {
39242 SrcVT = DstVT = MaskVT;
39243 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39244 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39245 return true;
39246 }
39247 }
39248
39249 // Attempt to match against a OR if we're performing a blend shuffle and the
39250 // non-blended source element is zero in each case.
39251 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
39252 if (SizeInBits == V1.getValueSizeInBits() &&
39253 SizeInBits == V2.getValueSizeInBits() &&
39254 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39255 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39256 bool IsBlend = true;
39257 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39258 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39259 unsigned Scale1 = NumV1Elts / NumMaskElts;
39260 unsigned Scale2 = NumV2Elts / NumMaskElts;
39261 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39262 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39263 for (unsigned i = 0; i != NumMaskElts; ++i) {
39264 int M = Mask[i];
39265 if (M == SM_SentinelUndef)
39266 continue;
39267 if (M == SM_SentinelZero) {
39268 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39269 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39270 continue;
39271 }
39272 if (M == (int)i) {
39273 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39274 continue;
39275 }
39276 if (M == (int)(i + NumMaskElts)) {
39277 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39278 continue;
39279 }
39280 IsBlend = false;
39281 break;
39282 }
39283 if (IsBlend) {
39284 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39285 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39286 Shuffle = ISD::OR;
39287 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39288 return true;
39289 }
39290 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39291 // FIXME: handle mismatched sizes?
39292 // TODO: investigate if `ISD::OR` handling in
39293 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39294 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39295 unsigned NumElts = V.getValueType().getVectorNumElements();
39296 KnownBits Known(NumElts);
39297 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39298 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39299 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39300 if (PeepholeKnown.isZero())
39301 Known.Zero.setBit(EltIdx);
39302 if (PeepholeKnown.isAllOnes())
39303 Known.One.setBit(EltIdx);
39304 }
39305 return Known;
39306 };
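      // V1Known/V2Known use one bit per element: Zero[i] means element i is
      // known to be all-zeros, One[i] means it is known to be all-ones.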
39307
39308 KnownBits V1Known = computeKnownBitsElementWise(V1);
39309 KnownBits V2Known = computeKnownBitsElementWise(V2);
39310
39311 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39312 int M = Mask[i];
39313 if (M == SM_SentinelUndef)
39314 continue;
39315 if (M == SM_SentinelZero) {
39316 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39317 continue;
39318 }
39319 if (M == (int)i) {
39320 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39321 continue;
39322 }
39323 if (M == (int)(i + NumMaskElts)) {
39324 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39325 continue;
39326 }
39327 llvm_unreachable("will not get here.");
39328 }
39329 if (IsBlend) {
39330 Shuffle = ISD::OR;
39331 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39332 return true;
39333 }
39334 }
39335 }
39336 }
39337
39338 return false;
39339}
39340
39341 static bool matchBinaryPermuteShuffle(
39342 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39343 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39344 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39345 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39346 unsigned NumMaskElts = Mask.size();
39347 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39348
39349 // Attempt to match against VALIGND/VALIGNQ rotate.
39350 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39351 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39352 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39353 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39354 if (!isAnyZero(Mask)) {
39355 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39356 if (0 < Rotation) {
39357 Shuffle = X86ISD::VALIGN;
39358 if (EltSizeInBits == 64)
39359 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39360 else
39361 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39362 PermuteImm = Rotation;
39363 return true;
39364 }
39365 }
39366 }
39367
39368 // Attempt to match against PALIGNR byte rotate.
39369 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39370 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39371 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39372 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39373 if (0 < ByteRotation) {
39374 Shuffle = X86ISD::PALIGNR;
39375 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39376 PermuteImm = ByteRotation;
39377 return true;
39378 }
39379 }
39380
39381 // Attempt to combine to X86ISD::BLENDI.
39382 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39383 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39384 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39385 uint64_t BlendMask = 0;
39386 bool ForceV1Zero = false, ForceV2Zero = false;
39387 SmallVector<int, 8> TargetMask(Mask);
39388 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39389 ForceV2Zero, BlendMask)) {
39390 if (MaskVT == MVT::v16i16) {
39391 // We can only use v16i16 PBLENDW if the lanes are repeated.
39392 SmallVector<int, 8> RepeatedMask;
39393 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39394 RepeatedMask)) {
39395 assert(RepeatedMask.size() == 8 &&
39396 "Repeated mask size doesn't match!");
39397 PermuteImm = 0;
39398 for (int i = 0; i < 8; ++i)
39399 if (RepeatedMask[i] >= 8)
39400 PermuteImm |= 1 << i;
39401 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39402 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39403 Shuffle = X86ISD::BLENDI;
39404 ShuffleVT = MaskVT;
39405 return true;
39406 }
39407 } else {
39408 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39409 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39410 PermuteImm = (unsigned)BlendMask;
39411 Shuffle = X86ISD::BLENDI;
39412 ShuffleVT = MaskVT;
39413 return true;
39414 }
39415 }
39416 }
39417
39418 // Attempt to combine to INSERTPS, but only if it has elements that need to
39419 // be set to zero.
39420 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39421 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39422 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39423 Shuffle = X86ISD::INSERTPS;
39424 ShuffleVT = MVT::v4f32;
39425 return true;
39426 }
39427
39428 // Attempt to combine to SHUFPD.
39429 if (AllowFloatDomain && EltSizeInBits == 64 &&
39430 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39431 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39432 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39433 bool ForceV1Zero = false, ForceV2Zero = false;
39434 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39435 PermuteImm, Mask, Zeroable)) {
39436 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39437 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39438 Shuffle = X86ISD::SHUFP;
39439 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39440 return true;
39441 }
39442 }
39443
39444 // Attempt to combine to SHUFPS.
39445 if (AllowFloatDomain && EltSizeInBits == 32 &&
39446 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39447 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39448 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39449 SmallVector<int, 4> RepeatedMask;
39450 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39452       // Match each half of the repeated mask to determine if it's just
39452 // referencing one of the vectors, is zeroable or entirely undef.
39453 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39454 int M0 = RepeatedMask[Offset];
39455 int M1 = RepeatedMask[Offset + 1];
39456
39457 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39458 return DAG.getUNDEF(MaskVT);
39459 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39460 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39461 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39462 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39463 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39464 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39465 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39466 return V1;
39467 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39468 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39469 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39470 return V2;
39471 }
39472
39473 return SDValue();
39474 };
39475
39476 int ShufMask[4] = {-1, -1, -1, -1};
39477 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39478 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39479
39480 if (Lo && Hi) {
39481 V1 = Lo;
39482 V2 = Hi;
39483 Shuffle = X86ISD::SHUFP;
39484 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39485 PermuteImm = getV4X86ShuffleImm(ShufMask);
39486 return true;
39487 }
39488 }
39489 }
39490
39491 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39492 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39493 MaskVT.is128BitVector() &&
39494 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39495 Shuffle = X86ISD::INSERTPS;
39496 ShuffleVT = MVT::v4f32;
39497 return true;
39498 }
39499
39500 return false;
39501}
39502
39503 static SDValue combineX86ShuffleChainWithExtract(
39504     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39505 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39506 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39507 const X86Subtarget &Subtarget);
39508
39509/// Combine an arbitrary chain of shuffles into a single instruction if
39510/// possible.
39511///
39512/// This is the leaf of the recursive combine below. When we have found some
39513/// chain of single-use x86 shuffle instructions and accumulated the combined
39514/// shuffle mask represented by them, this will try to pattern match that mask
39515/// into either a single instruction if there is a special purpose instruction
39516/// for this operation, or into a PSHUFB instruction which is a fully general
39517/// instruction but should only be used to replace chains over a certain depth.
39518 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39519                                       ArrayRef<int> BaseMask, int Depth,
39520 bool HasVariableMask,
39521 bool AllowVariableCrossLaneMask,
39522 bool AllowVariablePerLaneMask,
39523 SelectionDAG &DAG,
39524 const X86Subtarget &Subtarget) {
39525 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39526 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39527 "Unexpected number of shuffle inputs!");
39528
39529 SDLoc DL(Root);
39530 MVT RootVT = Root.getSimpleValueType();
39531 unsigned RootSizeInBits = RootVT.getSizeInBits();
39532 unsigned NumRootElts = RootVT.getVectorNumElements();
39533
39534 // Canonicalize shuffle input op to the requested type.
39535 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39536 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39537 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39538 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39539 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39540 return DAG.getBitcast(VT, Op);
39541 };
39542
39543 // Find the inputs that enter the chain. Note that multiple uses are OK
39544   // here; we're not going to remove the operands we find.
39545 bool UnaryShuffle = (Inputs.size() == 1);
39546 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39547 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39548 : peekThroughBitcasts(Inputs[1]));
39549
39550 MVT VT1 = V1.getSimpleValueType();
39551 MVT VT2 = V2.getSimpleValueType();
39552 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39553 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39554
39555 SDValue Res;
39556
39557 unsigned NumBaseMaskElts = BaseMask.size();
39558 if (NumBaseMaskElts == 1) {
39559 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39560 return CanonicalizeShuffleInput(RootVT, V1);
39561 }
39562
39563 bool OptForSize = DAG.shouldOptForSize();
39564 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39565 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39566 (RootVT.isFloatingPoint() && Depth >= 1) ||
39567 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39568
39569   // Don't combine if we are an AVX512/EVEX target and the mask element size
39570 // is different from the root element size - this would prevent writemasks
39571 // from being reused.
39572 bool IsMaskedShuffle = false;
39573 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39574 if (Root.hasOneUse() && Root->user_begin()->getOpcode() == ISD::VSELECT &&
39575 Root->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39576 IsMaskedShuffle = true;
39577 }
39578 }
39579
39580 // If we are shuffling a splat (and not introducing zeros) then we can just
39581 // use it directly. This works for smaller elements as well as they already
39582 // repeat across each mask element.
39583 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39584 V1.getValueSizeInBits() >= RootSizeInBits &&
39585 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39586 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39587 return CanonicalizeShuffleInput(RootVT, V1);
39588 }
39589
39590 SmallVector<int, 64> Mask(BaseMask);
39591
39592 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39593 // etc. can be simplified.
39594 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39595 SmallVector<int> ScaledMask, IdentityMask;
39596 unsigned NumElts = VT1.getVectorNumElements();
39597 if (Mask.size() <= NumElts &&
39598 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39599 for (unsigned i = 0; i != NumElts; ++i)
39600 IdentityMask.push_back(i);
39601 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39602 V2))
39603 return CanonicalizeShuffleInput(RootVT, V1);
39604 }
39605 }
39606
39607 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39608 if (RootVT.is512BitVector() &&
39609 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39610 // If the upper subvectors are zeroable, then an extract+insert is more
39611 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39612 // to zero the upper subvectors.
39613 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39614 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39615 return SDValue(); // Nothing to do!
39616 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39617 "Unexpected lane shuffle");
39618 Res = CanonicalizeShuffleInput(RootVT, V1);
39619 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39620 bool UseZero = isAnyZero(Mask);
39621 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39622 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39623 }
39624
39625 // Narrow shuffle mask to v4x128.
39626 SmallVector<int, 4> ScaledMask;
39627 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39628 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39629
39630 // Try to lower to vshuf64x2/vshuf32x4.
39631 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39632 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39633 SelectionDAG &DAG) {
39634 int PermMask[4] = {-1, -1, -1, -1};
39635 // Ensure elements came from the same Op.
39636 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39637 for (int i = 0; i < 4; ++i) {
39638 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39639 if (ScaledMask[i] < 0)
39640 continue;
39641
39642 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39643 unsigned OpIndex = i / 2;
39644 if (Ops[OpIndex].isUndef())
39645 Ops[OpIndex] = Op;
39646 else if (Ops[OpIndex] != Op)
39647 return SDValue();
39648
39649 PermMask[i] = ScaledMask[i] % 4;
39650 }
39651
39652 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39653 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39654 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39655 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
39656 };
39657
39658 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39659 // doesn't work because our mask is for 128 bits and we don't have an MVT
39660 // to match that.
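  // PreferPERMQ: the shuffle keeps each 128-bit lane within its 256-bit half
  // and repeats the same pattern in both halves, so it is better left for
  // VPERMQ/VPERMPD style combines than matched as SHUF128 here.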
39661 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39662 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39663 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39664 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39665 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39666 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39667 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39668 ScaledMask[1] == (ScaledMask[3] % 2));
39669
39670 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39671 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39672 return SDValue(); // Nothing to do!
39673 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39674 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39675 return DAG.getBitcast(RootVT, V);
39676 }
39677 }
39678
39679 // Handle 128-bit lane shuffles of 256-bit vectors.
39680 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39681 // If the upper half is zeroable, then an extract+insert is more optimal
39682 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39683 // zero the upper half.
39684 if (isUndefOrZero(Mask[1])) {
39685 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39686 return SDValue(); // Nothing to do!
39687 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39688 Res = CanonicalizeShuffleInput(RootVT, V1);
39689 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39690 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39691 256);
39692 }
39693
39694 // If we're inserting the low subvector, an insert-subvector 'concat'
39695 // pattern is quicker than VPERM2X128.
39696 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39697 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39698 !Subtarget.hasAVX2()) {
39699 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39700 return SDValue(); // Nothing to do!
39701 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39702 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39703 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39704 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39705 }
39706
39707 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39708 return SDValue(); // Nothing to do!
39709
39710 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39711 // we need to use the zeroing feature.
39712 // Prefer blends for sequential shuffles unless we are optimizing for size.
39713 if (UnaryShuffle &&
39714 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39715 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
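      // Build the VPERM2X128 immediate: bits [1:0]/[5:4] pick the source half
      // for the low/high 128 bits, and bits 3/7 zero that half (used here for
      // undef/zero mask elements).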
39716 unsigned PermMask = 0;
39717 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39718 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39719 return DAG.getNode(
39720 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39721 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39722 }
39723
39724 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39725 return SDValue(); // Nothing to do!
39726
39727 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39728 if (!UnaryShuffle && !IsMaskedShuffle) {
39729 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39730 "Unexpected shuffle sentinel value");
39731 // Prefer blends to X86ISD::VPERM2X128.
39732 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39733 unsigned PermMask = 0;
39734 PermMask |= ((Mask[0] & 3) << 0);
39735 PermMask |= ((Mask[1] & 3) << 4);
39736 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39737 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39738 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39739 CanonicalizeShuffleInput(RootVT, LHS),
39740 CanonicalizeShuffleInput(RootVT, RHS),
39741 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39742 }
39743 }
39744 }
39745
39746 // For masks that have been widened to 128-bit elements or more,
39747 // narrow back down to 64-bit elements.
39748 if (BaseMaskEltSizeInBits > 64) {
39749 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39750 int MaskScale = BaseMaskEltSizeInBits / 64;
39751 SmallVector<int, 64> ScaledMask;
39752 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39753 Mask = std::move(ScaledMask);
39754 }
39755
39756 // For masked shuffles, we're trying to match the root width for better
39757 // writemask folding, attempt to scale the mask.
39758 // TODO - variable shuffles might need this to be widened again.
39759 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39760 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39761 int MaskScale = NumRootElts / Mask.size();
39762 SmallVector<int, 64> ScaledMask;
39763 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39764 Mask = std::move(ScaledMask);
39765 }
39766
39767 unsigned NumMaskElts = Mask.size();
39768 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39769 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39770
39771 // Determine the effective mask value type.
39772 FloatDomain &= (32 <= MaskEltSizeInBits);
39773 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39774 : MVT::getIntegerVT(MaskEltSizeInBits);
39775 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39776
39777 // Only allow legal mask types.
39778 if (!TLI.isTypeLegal(MaskVT))
39779 return SDValue();
39780
39781 // Attempt to match the mask against known shuffle patterns.
39782 MVT ShuffleSrcVT, ShuffleVT;
39783 unsigned Shuffle, PermuteImm;
39784
39785 // Which shuffle domains are permitted?
39786 // Permit domain crossing at higher combine depths.
39787 // TODO: Should we indicate which domain is preferred if both are allowed?
39788 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39789 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39790 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39791
39792 // Determine zeroable mask elements.
39793 APInt KnownUndef, KnownZero;
39794 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39795 APInt Zeroable = KnownUndef | KnownZero;
39796
39797 if (UnaryShuffle) {
39798 // Attempt to match against broadcast-from-vector.
39799 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39800 if ((Subtarget.hasAVX2() ||
39801 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39802 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39803 if (isUndefOrEqual(Mask, 0)) {
39804 if (V1.getValueType() == MaskVT &&
39805             V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39806             X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39807 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39808 return SDValue(); // Nothing to do!
39809 Res = V1.getOperand(0);
39810 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39811 return DAG.getBitcast(RootVT, Res);
39812 }
39813 if (Subtarget.hasAVX2()) {
39814 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39815 return SDValue(); // Nothing to do!
39816 Res = CanonicalizeShuffleInput(MaskVT, V1);
39817 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39818 return DAG.getBitcast(RootVT, Res);
39819 }
39820 }
39821 }
39822
39823 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39824 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39825 (!IsMaskedShuffle ||
39826 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39827 if (Depth == 0 && Root.getOpcode() == Shuffle)
39828 return SDValue(); // Nothing to do!
39829 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39830 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39831 return DAG.getBitcast(RootVT, Res);
39832 }
39833
39834 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39835 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39836 PermuteImm) &&
39837 (!IsMaskedShuffle ||
39838 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39839 if (Depth == 0 && Root.getOpcode() == Shuffle)
39840 return SDValue(); // Nothing to do!
39841 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39842 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39843 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39844 return DAG.getBitcast(RootVT, Res);
39845 }
39846 }
39847
39848 // Attempt to combine to INSERTPS, but only if the inserted element has come
39849 // from a scalar.
39850 // TODO: Handle other insertions here as well?
39851 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39852 Subtarget.hasSSE41() &&
39853 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39854 if (MaskEltSizeInBits == 32) {
39855 SDValue SrcV1 = V1, SrcV2 = V2;
39856 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39857 DAG) &&
39858 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39859 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39860 return SDValue(); // Nothing to do!
39861 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39862 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39863 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39864 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39865 return DAG.getBitcast(RootVT, Res);
39866 }
39867 }
39868 if (MaskEltSizeInBits == 64 &&
39869 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39870 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39871 V2.getScalarValueSizeInBits() <= 32) {
39872 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39873 return SDValue(); // Nothing to do!
39874 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39875 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39876 CanonicalizeShuffleInput(MVT::v4f32, V1),
39877 CanonicalizeShuffleInput(MVT::v4f32, V2),
39878 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39879 return DAG.getBitcast(RootVT, Res);
39880 }
39881 }
39882
39883 SDValue NewV1 = V1; // Save operands in case early exit happens.
39884 SDValue NewV2 = V2;
39885 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39886 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39887 ShuffleVT, UnaryShuffle) &&
39888 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39889 if (Depth == 0 && Root.getOpcode() == Shuffle)
39890 return SDValue(); // Nothing to do!
39891 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39892 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39893 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39894 return DAG.getBitcast(RootVT, Res);
39895 }
39896
39897 NewV1 = V1; // Save operands in case early exit happens.
39898 NewV2 = V2;
39899 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39900 AllowIntDomain, NewV1, NewV2, DL, DAG,
39901 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39902 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39903 if (Depth == 0 && Root.getOpcode() == Shuffle)
39904 return SDValue(); // Nothing to do!
39905 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39906 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39907 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39908 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39909 return DAG.getBitcast(RootVT, Res);
39910 }
39911
39912 // Typically from here on, we need an integer version of MaskVT.
39913 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39914 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39915
39916 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39917 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39918 uint64_t BitLen, BitIdx;
39919 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39920 Zeroable)) {
39921 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39922 return SDValue(); // Nothing to do!
39923 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39924 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39925 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39926 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39927 return DAG.getBitcast(RootVT, Res);
39928 }
39929
39930 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39931 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39932 return SDValue(); // Nothing to do!
39933 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39934 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39935 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39936 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39937 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39938 return DAG.getBitcast(RootVT, Res);
39939 }
39940 }
39941
39942 // Match shuffle against TRUNCATE patterns.
39943 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39944 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39945 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39946 Subtarget)) {
39947 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39948 ShuffleSrcVT.getVectorNumElements();
39949 unsigned Opc =
39950 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
39951 if (Depth == 0 && Root.getOpcode() == Opc)
39952 return SDValue(); // Nothing to do!
39953 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39954 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
39955 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
39956 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
39957 return DAG.getBitcast(RootVT, Res);
39958 }
39959
39960 // Do we need a more general binary truncation pattern?
39961 if (RootSizeInBits < 512 &&
39962 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
39963 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
39964 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
39965 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
39966 // Bail if this was already a truncation or PACK node.
39967 // We sometimes fail to match PACK if we demand known undef elements.
39968 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
39969 Root.getOpcode() == X86ISD::PACKSS ||
39970 Root.getOpcode() == X86ISD::PACKUS))
39971 return SDValue(); // Nothing to do!
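      // View each input as a root-width vector of double-width elements, then
      // concatenate the two inputs and truncate back down to the mask type.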
39972 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39973 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
39974 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39975 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
39976 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39977 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
39978 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
39979 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
39980 return DAG.getBitcast(RootVT, Res);
39981 }
39982 }
39983
39984 // Don't try to re-form single instruction chains under any circumstances now
39985 // that we've done encoding canonicalization for them.
39986 if (Depth < 1)
39987 return SDValue();
39988
39989 // Depth threshold above which we can efficiently use variable mask shuffles.
39990 int VariableCrossLaneShuffleDepth =
39991 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
39992 int VariablePerLaneShuffleDepth =
39993 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
39994 AllowVariableCrossLaneMask &=
39995 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
39996 AllowVariablePerLaneMask &=
39997 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
39998 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
39999 // higher depth before combining them.
40000 bool AllowBWIVPERMV3 =
40001 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40002
40003 // If root was a VPERMV3 node, always allow a variable shuffle.
40004 if (Root.getOpcode() == X86ISD::VPERMV3)
40005 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40006
40007 bool MaskContainsZeros = isAnyZero(Mask);
40008
40009 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40010 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40011 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40012 if (Subtarget.hasAVX2() &&
40013 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40014 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40015 Res = CanonicalizeShuffleInput(MaskVT, V1);
40016 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40017 return DAG.getBitcast(RootVT, Res);
40018 }
40019 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40020 if ((Subtarget.hasAVX512() &&
40021 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40022 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40023 (Subtarget.hasBWI() &&
40024 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40025 (Subtarget.hasVBMI() &&
40026 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40027 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40028 V2 = DAG.getUNDEF(MaskVT);
40029 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40030 return DAG.getBitcast(RootVT, Res);
40031 }
40032 }
40033
40034 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40035 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40036 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40037 ((Subtarget.hasAVX512() &&
40038 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40039 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40040 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40041 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40042 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40043 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40044 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40045 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40046 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40047 for (unsigned i = 0; i != NumMaskElts; ++i)
40048 if (Mask[i] == SM_SentinelZero)
40049 Mask[i] = NumMaskElts + i;
40050 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40051 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40052 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40053 return DAG.getBitcast(RootVT, Res);
40054 }
40055
40056 // If that failed and either input is extracted then try to combine as a
40057 // shuffle with the larger type.
40058     if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40059             Inputs, Root, BaseMask, Depth, HasVariableMask,
40060 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40061 Subtarget))
40062 return WideShuffle;
40063
40064 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40065 // (non-VLX will pad to 512-bit shuffles).
40066 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40067 ((Subtarget.hasAVX512() &&
40068 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40069 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40070 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40071 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40072 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40073 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40074 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40075 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40076 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40077 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40078 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40079 return DAG.getBitcast(RootVT, Res);
40080 }
40081 return SDValue();
40082 }
40083
40084 // See if we can combine a single input shuffle with zeros to a bit-mask,
40085 // which is much simpler than any shuffle.
40086 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40087 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40088 TLI.isTypeLegal(MaskVT)) {
40089 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40090 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40091 APInt UndefElts(NumMaskElts, 0);
40092 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40093 for (unsigned i = 0; i != NumMaskElts; ++i) {
40094 int M = Mask[i];
40095 if (M == SM_SentinelUndef) {
40096 UndefElts.setBit(i);
40097 continue;
40098 }
40099 if (M == SM_SentinelZero)
40100 continue;
40101 EltBits[i] = AllOnes;
40102 }
40103 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40104 Res = CanonicalizeShuffleInput(MaskVT, V1);
40105 unsigned AndOpcode =
40106         MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40107     Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40108 return DAG.getBitcast(RootVT, Res);
40109 }
40110
40111   // If we have a single input shuffle with different shuffle patterns in the
40112   // 128-bit lanes, use the variable mask to VPERMILPS.
40113   // TODO: Combine other mask types at higher depths.
40114 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40115 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40116 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40117 SmallVector<SDValue, 16> VPermIdx;
40118 for (int M : Mask) {
40119 SDValue Idx =
40120 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40121 VPermIdx.push_back(Idx);
40122 }
40123 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40124 Res = CanonicalizeShuffleInput(MaskVT, V1);
40125 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40126 return DAG.getBitcast(RootVT, Res);
40127 }
40128
40129 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40130 // to VPERMIL2PD/VPERMIL2PS.
40131 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40132 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40133 MaskVT == MVT::v8f32)) {
40134 // VPERMIL2 Operation.
40135 // Bits[3] - Match Bit.
40136 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40137 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40138 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40139 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40140 SmallVector<int, 8> VPerm2Idx;
40141 unsigned M2ZImm = 0;
40142 for (int M : Mask) {
40143 if (M == SM_SentinelUndef) {
40144 VPerm2Idx.push_back(-1);
40145 continue;
40146 }
40147 if (M == SM_SentinelZero) {
40148 M2ZImm = 2;
40149 VPerm2Idx.push_back(8);
40150 continue;
40151 }
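      // Encode the selector: element index within the lane plus a source
      // select; PD indices are shifted left by one so they land in bits [2:1].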
40152 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40153 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40154 VPerm2Idx.push_back(Index);
40155 }
40156 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40157 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40158 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40159 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40160 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40161 return DAG.getBitcast(RootVT, Res);
40162 }
40163
40164 // If we have 3 or more shuffle instructions or a chain involving a variable
40165 // mask, we can replace them with a single PSHUFB instruction profitably.
40166   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40167 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40168 // more aggressive.
40169 if (UnaryShuffle && AllowVariablePerLaneMask &&
40170 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40171 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40172 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40173 SmallVector<SDValue, 16> PSHUFBMask;
40174 int NumBytes = RootVT.getSizeInBits() / 8;
40175 int Ratio = NumBytes / NumMaskElts;
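    // Expand each mask element to Ratio control bytes; 0x80 sets the PSHUFB
    // zeroing bit for bytes that must be cleared.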
40176 for (int i = 0; i < NumBytes; ++i) {
40177 int M = Mask[i / Ratio];
40178 if (M == SM_SentinelUndef) {
40179 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40180 continue;
40181 }
40182 if (M == SM_SentinelZero) {
40183 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40184 continue;
40185 }
40186 M = Ratio * M + i % Ratio;
40187 assert((M / 16) == (i / 16) && "Lane crossing detected");
40188 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40189 }
40190 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40191 Res = CanonicalizeShuffleInput(ByteVT, V1);
40192 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40193 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40194 return DAG.getBitcast(RootVT, Res);
40195 }
40196
40197 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40198 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40199 // slower than PSHUFB on targets that support both.
40200 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40201 Subtarget.hasXOP()) {
40202 // VPPERM Mask Operation
40203 // Bits[4:0] - Byte Index (0 - 31)
40204 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40205 SmallVector<SDValue, 16> VPPERMMask;
40206 int NumBytes = 16;
40207 int Ratio = NumBytes / NumMaskElts;
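    // As with PSHUFB, expand each mask element to Ratio control bytes; 0x80
    // selects the ZERO permute operation for that byte.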
40208 for (int i = 0; i < NumBytes; ++i) {
40209 int M = Mask[i / Ratio];
40210 if (M == SM_SentinelUndef) {
40211 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40212 continue;
40213 }
40214 if (M == SM_SentinelZero) {
40215 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40216 continue;
40217 }
40218 M = Ratio * M + i % Ratio;
40219 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40220 }
40221 MVT ByteVT = MVT::v16i8;
40222 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40223 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40224 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40225 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40226 return DAG.getBitcast(RootVT, Res);
40227 }
40228
40229 // If that failed and either input is extracted then try to combine as a
40230 // shuffle with the larger type.
40231   if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40232           Inputs, Root, BaseMask, Depth, HasVariableMask,
40233 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40234 return WideShuffle;
40235
40236 // If we have a dual input shuffle then lower to VPERMV3,
40237 // (non-VLX will pad to 512-bit shuffles)
40238 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40239 ((Subtarget.hasAVX512() &&
40240 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40241 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40242 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40243 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40244 MaskVT == MVT::v16i32)) ||
40245 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40246 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40247 MaskVT == MVT::v32i16)) ||
40248 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40249 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40250 MaskVT == MVT::v64i8)))) {
40251 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40252 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40253 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40254 return DAG.getBitcast(RootVT, Res);
40255 }
40256
40257 // Failed to find any combines.
40258 return SDValue();
40259}
40260
40261// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40262// instruction if possible.
40263//
40264// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40265// type size to attempt to combine:
40266// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40267// -->
40268// extract_subvector(shuffle(x,y,m2),0)
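// For instance (illustrative only): a shuffle of two 128-bit halves extracted
// from the same wider source can often be rematched as a single shuffle of the
// wide source, from which only the low 128 bits are then extracted.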
40269 static SDValue combineX86ShuffleChainWithExtract(
40270     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40271 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40272 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40273 const X86Subtarget &Subtarget) {
40274 unsigned NumMaskElts = BaseMask.size();
40275 unsigned NumInputs = Inputs.size();
40276 if (NumInputs == 0)
40277 return SDValue();
40278
40279 EVT RootVT = Root.getValueType();
40280 unsigned RootSizeInBits = RootVT.getSizeInBits();
40281 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40282 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40283
40284 // Peek through subvectors to find widest legal vector.
40285 // TODO: Handle ISD::TRUNCATE
40286 unsigned WideSizeInBits = RootSizeInBits;
40287 for (SDValue Input : Inputs) {
40288 Input = peekThroughBitcasts(Input);
40289 while (1) {
40290 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40291 Input = peekThroughBitcasts(Input.getOperand(0));
40292 continue;
40293 }
40294 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40295 Input.getOperand(0).isUndef()) {
40296 Input = peekThroughBitcasts(Input.getOperand(1));
40297 continue;
40298 }
40299 break;
40300 }
40301 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40302 WideSizeInBits < Input.getValueSizeInBits())
40303 WideSizeInBits = Input.getValueSizeInBits();
40304 }
40305
40306 // Bail if we fail to find a source larger than the existing root.
40307 unsigned Scale = WideSizeInBits / RootSizeInBits;
40308 if (WideSizeInBits <= RootSizeInBits ||
40309 (WideSizeInBits % RootSizeInBits) != 0)
40310 return SDValue();
40311
40312 // Create new mask for larger type.
40313 SmallVector<int, 64> WideMask(BaseMask);
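  // Each index keeps its offset within its source operand, but the operands
  // are renumbered to the wider (scaled) spacing.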
40314 for (int &M : WideMask) {
40315 if (M < 0)
40316 continue;
40317 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40318 }
40319 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
40320
40321 // Attempt to peek through inputs and adjust mask when we extract from an
40322 // upper subvector.
40323 int AdjustedMasks = 0;
40324 SmallVector<SDValue, 4> WideInputs(Inputs);
40325 for (unsigned I = 0; I != NumInputs; ++I) {
40326 SDValue &Input = WideInputs[I];
40327 Input = peekThroughBitcasts(Input);
40328 while (1) {
40329 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40330 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40331         uint64_t Idx = Input.getConstantOperandVal(1);
40332         if (Idx != 0) {
40333 ++AdjustedMasks;
40334 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40335 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40336
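          // Mask indices for input I live in [lo, hi); bias them by the
          // rescaled subvector start offset.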
40337 int lo = I * WideMask.size();
40338 int hi = (I + 1) * WideMask.size();
40339 for (int &M : WideMask)
40340 if (lo <= M && M < hi)
40341 M += Idx;
40342 }
40343 Input = peekThroughBitcasts(Input.getOperand(0));
40344 continue;
40345 }
40346 // TODO: Handle insertions into upper subvectors.
40347 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40348 Input.getOperand(0).isUndef() &&
40349 isNullConstant(Input.getOperand(2))) {
40350 Input = peekThroughBitcasts(Input.getOperand(1));
40351 continue;
40352 }
40353 break;
40354 }
40355 }
40356
40357 // Remove unused/repeated shuffle source ops.
40358 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40359 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40360
40361 // Bail if we're always extracting from the lowest subvectors,
40362 // combineX86ShuffleChain should match this for the current width, or the
40363 // shuffle still references too many inputs.
40364 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40365 return SDValue();
40366
40367 // Minor canonicalization of the accumulated shuffle mask to make it easier
40368 // to match below. All this does is detect masks with sequential pairs of
40369 // elements, and shrink them to the half-width mask. It does this in a loop
40370 // so it will reduce the size of the mask to the minimal width mask which
40371 // performs an equivalent shuffle.
40372 while (WideMask.size() > 1) {
40373 SmallVector<int, 64> WidenedMask;
40374 if (!canWidenShuffleElements(WideMask, WidenedMask))
40375 break;
40376 WideMask = std::move(WidenedMask);
40377 }
40378
40379 // Canonicalization of binary shuffle masks to improve pattern matching by
40380 // commuting the inputs.
40381 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40382     ShuffleVectorSDNode::commuteMask(WideMask);
40383     std::swap(WideInputs[0], WideInputs[1]);
40384 }
40385
40386 // Increase depth for every upper subvector we've peeked through.
40387 Depth += AdjustedMasks;
40388
40389 // Attempt to combine wider chain.
40390 // TODO: Can we use a better Root?
40391 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40392 WideInputs.back().getValueSizeInBits()
40393 ? WideInputs.front()
40394 : WideInputs.back();
40395 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40396 "WideRootSize mismatch");
40397
40398 if (SDValue WideShuffle =
40399 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40400 HasVariableMask, AllowVariableCrossLaneMask,
40401 AllowVariablePerLaneMask, DAG, Subtarget)) {
40402 WideShuffle =
40403 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40404 return DAG.getBitcast(RootVT, WideShuffle);
40405 }
40406
40407 return SDValue();
40408}
40409
40410// Canonicalize the combined shuffle mask chain with horizontal ops.
40411// NOTE: This may update the Ops and Mask.
40412 static SDValue canonicalizeShuffleMaskWithHorizOp(
40413     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40414     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40415 const X86Subtarget &Subtarget) {
40416 if (Mask.empty() || Ops.empty())
40417 return SDValue();
40418
40419   SmallVector<SDValue> BC;
40420   for (SDValue Op : Ops)
40421     BC.push_back(peekThroughBitcasts(Op));
40422
40423 // All ops must be the same horizop + type.
40424 SDValue BC0 = BC[0];
40425 EVT VT0 = BC0.getValueType();
40426 unsigned Opcode0 = BC0.getOpcode();
40427 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40428 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40429 }))
40430 return SDValue();
40431
40432 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40433 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40434 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40435 if (!isHoriz && !isPack)
40436 return SDValue();
40437
40438 // Do all ops have a single use?
40439 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40440     return Op.hasOneUse() &&
40441            peekThroughBitcasts(Op).hasOneUse();
40442 });
40443
40444 int NumElts = VT0.getVectorNumElements();
40445 int NumLanes = VT0.getSizeInBits() / 128;
40446 int NumEltsPerLane = NumElts / NumLanes;
40447 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40448 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40449 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40450
40451 if (NumEltsPerLane >= 4 &&
40452 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40453 SmallVector<int> LaneMask, ScaledMask;
40454 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40455 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40456 // See if we can remove the shuffle by resorting the HOP chain so that
40457 // the HOP args are pre-shuffled.
40458 // TODO: Generalize to any sized/depth chain.
40459 // TODO: Add support for PACKSS/PACKUS.
40460 if (isHoriz) {
40461 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40462 auto GetHOpSrc = [&](int M) {
40463 if (M == SM_SentinelUndef)
40464 return DAG.getUNDEF(VT0);
40465 if (M == SM_SentinelZero)
40466 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40467 SDValue Src0 = BC[M / 4];
40468 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40469 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40470 return Src1.getOperand(M % 2);
40471 return SDValue();
40472 };
40473 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40474 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40475 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40476 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40477 if (M0 && M1 && M2 && M3) {
40478 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40479 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40480 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40481 }
40482 }
40483 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40484 if (Ops.size() >= 2) {
40485 SDValue LHS, RHS;
40486 auto GetHOpSrc = [&](int M, int &OutM) {
40487 // TODO: Support SM_SentinelZero
40488 if (M < 0)
40489 return M == SM_SentinelUndef;
40490 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40491 if (!LHS || LHS == Src) {
40492 LHS = Src;
40493 OutM = (M % 2);
40494 return true;
40495 }
40496 if (!RHS || RHS == Src) {
40497 RHS = Src;
40498 OutM = (M % 2) + 2;
40499 return true;
40500 }
40501 return false;
40502 };
40503 int PostMask[4] = {-1, -1, -1, -1};
40504 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40505 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40506 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40507 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40508 LHS = DAG.getBitcast(SrcVT, LHS);
40509 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40510 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40511 // Use SHUFPS for the permute so this will work on SSE2 targets,
40512 // shuffle combining and domain handling will simplify this later on.
40513 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40514 Res = DAG.getBitcast(ShuffleVT, Res);
40515 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40516 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40517 }
40518 }
40519 }
40520 }
40521
40522 if (2 < Ops.size())
40523 return SDValue();
40524
40525 SDValue BC1 = BC[BC.size() - 1];
40526 if (Mask.size() == VT0.getVectorNumElements()) {
40527 // Canonicalize binary shuffles of horizontal ops that use the
40528 // same sources to an unary shuffle.
40529 // TODO: Try to perform this fold even if the shuffle remains.
40530 if (Ops.size() == 2) {
40531 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40532 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40533 };
40534 // Commute if all BC0's ops are contained in BC1.
40535 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40536 ContainsOps(BC1, BC0.getOperand(1))) {
40537         ShuffleVectorSDNode::commuteMask(Mask);
40538         std::swap(Ops[0], Ops[1]);
40539 std::swap(BC0, BC1);
40540 }
40541
40542 // If BC1 can be represented by BC0, then convert to unary shuffle.
40543 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40544 ContainsOps(BC0, BC1.getOperand(1))) {
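        // Rewrite BC1 references in terms of BC0's operands: for a horizontal
        // op the lower/upper half of each lane comes from operand 0/1, so pick
        // the matching BC0 half for each element.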
40545 for (int &M : Mask) {
40546 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40547 continue;
40548 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40549 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40550 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40551 M += NumHalfEltsPerLane;
40552 }
40553 }
40554 }
40555
40556 // Canonicalize unary horizontal ops to only refer to lower halves.
40557 for (int i = 0; i != NumElts; ++i) {
40558 int &M = Mask[i];
40559 if (isUndefOrZero(M))
40560 continue;
40561 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40562 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40563 M -= NumHalfEltsPerLane;
40564 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40565 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40566 M -= NumHalfEltsPerLane;
40567 }
40568 }
40569
40570 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40571 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40572 // represents the LHS/RHS inputs for the lower/upper halves.
40573 SmallVector<int, 16> TargetMask128, WideMask128;
40574 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40575 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40576 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40577 bool SingleOp = (Ops.size() == 1);
40578 if (isPack || OneUseOps ||
40579 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40580 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40581 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40582 Lo = Lo.getOperand(WideMask128[0] & 1);
40583 Hi = Hi.getOperand(WideMask128[1] & 1);
40584 if (SingleOp) {
40585 SDValue Undef = DAG.getUNDEF(SrcVT);
40586 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40587 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40588 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40589 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40590 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40591 }
40592 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40593 }
40594 }
40595
40596 // If we are post-shuffling a 256-bit hop and not requiring the upper
40597 // elements, then try to narrow to a 128-bit hop directly.
40598 SmallVector<int, 16> WideMask64;
40599 if (Ops.size() == 1 && NumLanes == 2 &&
40600 scaleShuffleElements(Mask, 4, WideMask64) &&
40601 isUndefInRange(WideMask64, 2, 2)) {
40602 int M0 = WideMask64[0];
40603 int M1 = WideMask64[1];
40604 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
40605       MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
40606       unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
40607 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
40608 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
40609 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
40610 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
40611 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
40612 }
40613 }
40614
40615 return SDValue();
40616}
40617
40618// Attempt to constant fold all of the constant source ops.
40619// Returns true if the entire shuffle is folded to a constant.
40620// TODO: Extend this to merge multiple constant Ops and update the mask.
40621 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
40622                                            ArrayRef<int> Mask,
40623 bool HasVariableMask,
40624 SelectionDAG &DAG, const SDLoc &DL,
40625 const X86Subtarget &Subtarget) {
40626 unsigned SizeInBits = VT.getSizeInBits();
40627 unsigned NumMaskElts = Mask.size();
40628 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40629 unsigned NumOps = Ops.size();
40630
40631 // Extract constant bits from each source op.
40632 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40633 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40634 for (unsigned I = 0; I != NumOps; ++I)
40635 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40636 RawBitsOps[I],
40637 /*AllowWholeUndefs*/ true,
40638 /*AllowPartialUndefs*/ true))
40639 return SDValue();
40640
40641 // If we're optimizing for size, only fold if at least one of the constants is
40642 // only used once or the combined shuffle has included a variable mask
40643   // shuffle; this is to avoid constant pool bloat.
40644 bool IsOptimizingSize = DAG.shouldOptForSize();
40645 if (IsOptimizingSize && !HasVariableMask &&
40646 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40647 return SDValue();
40648
40649 // Shuffle the constant bits according to the mask.
40650 APInt UndefElts(NumMaskElts, 0);
40651 APInt ZeroElts(NumMaskElts, 0);
40652 APInt ConstantElts(NumMaskElts, 0);
40653 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40654 APInt::getZero(MaskSizeInBits));
40655 for (unsigned i = 0; i != NumMaskElts; ++i) {
40656 int M = Mask[i];
40657 if (M == SM_SentinelUndef) {
40658 UndefElts.setBit(i);
40659 continue;
40660 } else if (M == SM_SentinelZero) {
40661 ZeroElts.setBit(i);
40662 continue;
40663 }
40664 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40665
40666 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40667 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40668
40669 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40670 if (SrcUndefElts[SrcMaskIdx]) {
40671 UndefElts.setBit(i);
40672 continue;
40673 }
40674
40675 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40676 APInt &Bits = SrcEltBits[SrcMaskIdx];
40677 if (!Bits) {
40678 ZeroElts.setBit(i);
40679 continue;
40680 }
40681
40682 ConstantElts.setBit(i);
40683 ConstantBitData[i] = Bits;
40684 }
40685 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40686
40687 // Attempt to create a zero vector.
40688 if ((UndefElts | ZeroElts).isAllOnes())
40689 return getZeroVector(VT, Subtarget, DAG, DL);
40690
40691 // Create the constant data.
40692 MVT MaskSVT;
40693 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40694 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40695 else
40696 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40697
40698 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40699 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40700 return SDValue();
40701
40702 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40703 return DAG.getBitcast(VT, CstOp);
40704}
40705
40706namespace llvm {
40707 namespace X86 {
40708   enum {
40709     MaxShuffleCombineDepth = 8
40710   };
40711   } // namespace X86
40712} // namespace llvm
40713
40714/// Fully generic combining of x86 shuffle instructions.
40715///
40716/// This should be the last combine run over the x86 shuffle instructions. Once
40717/// they have been fully optimized, this will recursively consider all chains
40718/// of single-use shuffle instructions, build a generic model of the cumulative
40719/// shuffle operation, and check for simpler instructions which implement this
40720/// operation. We use this primarily for two purposes:
40721///
40722/// 1) Collapse generic shuffles to specialized single instructions when
40723/// equivalent. In most cases, this is just an encoding size win, but
40724/// sometimes we will collapse multiple generic shuffles into a single
40725/// special-purpose shuffle.
40726/// 2) Look for sequences of shuffle instructions with 3 or more total
40727/// instructions, and replace them with the slightly more expensive SSSE3
40728/// PSHUFB instruction if available. We do this as the last combining step
40729/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40730/// a suitable short sequence of other instructions. The PSHUFB will either
40731/// use a register or have to read from memory and so is slightly (but only
40732/// slightly) more expensive than the other shuffle instructions.
40733///
40734/// Because this is inherently a quadratic operation (for each shuffle in
40735/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40736/// This should never be an issue in practice as the shuffle lowering doesn't
40737/// produce sequences of more than 8 instructions.
40738///
40739/// FIXME: We will currently miss some cases where the redundant shuffling
40740/// would simplify under the threshold for PSHUFB formation because of
40741/// combine-ordering. To fix this, we should do the redundant instruction
40742/// combining in this recursive walk.
40743 static SDValue combineX86ShufflesRecursively(
40744     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40745 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40746 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40747 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40748 const X86Subtarget &Subtarget) {
40749 assert(!RootMask.empty() &&
40750 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40751 "Illegal shuffle root mask");
40752 MVT RootVT = Root.getSimpleValueType();
40753 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40754 unsigned RootSizeInBits = RootVT.getSizeInBits();
40755 SDLoc DL(Root);
40756
40757 // Bound the depth of our recursive combine because this is ultimately
40758 // quadratic in nature.
40759 if (Depth >= MaxDepth)
40760 return SDValue();
40761
40762 // Directly rip through bitcasts to find the underlying operand.
40763 SDValue Op = SrcOps[SrcOpIndex];
40764 Op = peekThroughBitcasts(Op);
40765
40766 EVT VT = Op.getValueType();
40767 if (!VT.isVector() || !VT.isSimple())
40768 return SDValue(); // Bail if we hit a non-simple non-vector.
40769
40770 // FIXME: Just bail on f16 for now.
40771 if (VT.getVectorElementType() == MVT::f16)
40772 return SDValue();
40773
40774 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40775 "Can only combine shuffles up to size of the root op.");
40776
40777 // Create a demanded elts mask from the referenced elements of Op.
40778 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40779 for (int M : RootMask) {
40780 int BaseIdx = RootMask.size() * SrcOpIndex;
40781 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40782 OpDemandedElts.setBit(M - BaseIdx);
40783 }
40784 if (RootSizeInBits != VT.getSizeInBits()) {
40785 // Op is smaller than Root - extract the demanded elts for the subvector.
40786 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40787 unsigned NumOpMaskElts = RootMask.size() / Scale;
40788 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40789 assert(OpDemandedElts
40790 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40791 .isZero() &&
40792 "Out of range elements referenced in root mask");
40793 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40794 }
40795 OpDemandedElts =
40796 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
40797
40798 // Extract target shuffle mask and resolve sentinels and inputs.
40799 SmallVector<int, 64> OpMask;
40800 SmallVector<SDValue, 2> OpInputs;
40801 APInt OpUndef, OpZero;
40802 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40803 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40804 OpZero, DAG, Depth, false)) {
40805 // Shuffle inputs must not be larger than the shuffle result.
40806 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40807 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40808 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40809 }))
40810 return SDValue();
40811 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40812 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40813 !isNullConstant(Op.getOperand(1))) {
40814 SDValue SrcVec = Op.getOperand(0);
40815 int ExtractIdx = Op.getConstantOperandVal(1);
40816 unsigned NumElts = VT.getVectorNumElements();
40817 OpInputs.assign({SrcVec});
40818 OpMask.assign(NumElts, SM_SentinelUndef);
40819 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40820 OpZero = OpUndef = APInt::getZero(NumElts);
40821 } else {
40822 return SDValue();
40823 }
40824
40825 // If the shuffle result was smaller than the root, we need to adjust the
40826 // mask indices and pad the mask with undefs.
40827 if (RootSizeInBits > VT.getSizeInBits()) {
40828 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40829 unsigned OpMaskSize = OpMask.size();
40830 if (OpInputs.size() > 1) {
40831 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40832 for (int &M : OpMask) {
40833 if (M < 0)
40834 continue;
40835 int EltIdx = M % OpMaskSize;
40836 int OpIdx = M / OpMaskSize;
40837 M = (PaddedMaskSize * OpIdx) + EltIdx;
40838 }
40839 }
40840 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40841 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40842 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40843 }
40844
40845 SmallVector<int, 64> Mask;
40846 SmallVector<SDValue, 16> Ops;
40847
40848 // We don't need to merge masks if the root is empty.
40849 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40850 if (EmptyRoot) {
40851 // Only resolve zeros if it will remove an input; otherwise we might end
40852 // up in an infinite loop.
40853 bool ResolveKnownZeros = true;
40854 if (!OpZero.isZero()) {
40855 APInt UsedInputs = APInt::getZero(OpInputs.size());
40856 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40857 int M = OpMask[i];
40858 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40859 continue;
40860 UsedInputs.setBit(M / OpMask.size());
40861 if (UsedInputs.isAllOnes()) {
40862 ResolveKnownZeros = false;
40863 break;
40864 }
40865 }
40866 }
40867 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40868 ResolveKnownZeros);
40869
40870 Mask = OpMask;
40871 Ops.append(OpInputs.begin(), OpInputs.end());
40872 } else {
40873 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40874
40875 // Add the inputs to the Ops list, avoiding duplicates.
40876 Ops.append(SrcOps.begin(), SrcOps.end());
40877
40878 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40879 // Attempt to find an existing match.
40880 SDValue InputBC = peekThroughBitcasts(Input);
40881 for (int i = 0, e = Ops.size(); i < e; ++i)
40882 if (InputBC == peekThroughBitcasts(Ops[i]))
40883 return i;
40884 // Match failed - should we replace an existing Op?
40885 if (InsertionPoint >= 0) {
40886 Ops[InsertionPoint] = Input;
40887 return InsertionPoint;
40888 }
40889 // Add to the end of the Ops list.
40890 Ops.push_back(Input);
40891 return Ops.size() - 1;
40892 };
40893
40894 SmallVector<int, 2> OpInputIdx;
40895 for (SDValue OpInput : OpInputs)
40896 OpInputIdx.push_back(
40897 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40898
40899 assert(((RootMask.size() > OpMask.size() &&
40900 RootMask.size() % OpMask.size() == 0) ||
40901 (OpMask.size() > RootMask.size() &&
40902 OpMask.size() % RootMask.size() == 0) ||
40903 OpMask.size() == RootMask.size()) &&
40904 "The smaller number of elements must divide the larger.");
40905
40906 // This function can be performance-critical, so we rely on the power-of-2
40907 // knowledge that we have about the mask sizes to replace div/rem ops with
40908 // bit-masks and shifts.
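// Illustrative arithmetic (editorial): with a power-of-2 size S,
// x % S == x & (S - 1) and x / S == x >> log2(S);
// e.g. 13 % 8 == 13 & 7 == 5 and 13 / 8 == 13 >> 3 == 1.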
40909 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
40910 "Non-power-of-2 shuffle mask sizes");
40911 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
40912 "Non-power-of-2 shuffle mask sizes");
40913 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
40914 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
40915
40916 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40917 unsigned RootRatio =
40918 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40919 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40920 assert((RootRatio == 1 || OpRatio == 1) &&
40921 "Must not have a ratio for both incoming and op masks!");
40922
40923 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40924 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40925 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40926 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
40927 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
40928
40929 Mask.resize(MaskWidth, SM_SentinelUndef);
40930
40931 // Merge this shuffle operation's mask into our accumulated mask. Note that
40932 // this shuffle's mask will be the first applied to the input, followed by
40933 // the root mask to get us all the way to the root value arrangement. The
40934 // reason for this order is that we are recursing up the operation chain.
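// Worked example (illustrative): with SrcOpIndex == 0, equal mask widths of 4
// (RootRatio == OpRatio == 1) and this op mapped to input slot 0, a root mask
// <0,5,2,7> over an op mask <1,0,3,2> merges to <1,5,3,7>: lanes 1 and 3
// reference another source op and are kept as-is, while lanes 0 and 2 are
// redirected through the op mask (OpMask[0] == 1, OpMask[2] == 3).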
40935 for (unsigned i = 0; i < MaskWidth; ++i) {
40936 unsigned RootIdx = i >> RootRatioLog2;
40937 if (RootMask[RootIdx] < 0) {
40938 // This is a zero or undef lane, we're done.
40939 Mask[i] = RootMask[RootIdx];
40940 continue;
40941 }
40942
40943 unsigned RootMaskedIdx =
40944 RootRatio == 1
40945 ? RootMask[RootIdx]
40946 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40947
40948 // Just insert the scaled root mask value if it references an input other
40949 // than the SrcOp we're currently inserting.
40950 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
40951 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
40952 Mask[i] = RootMaskedIdx;
40953 continue;
40954 }
40955
40956 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40957 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
40958 if (OpMask[OpIdx] < 0) {
40959 // The incoming lanes are zero or undef, it doesn't matter which ones we
40960 // are using.
40961 Mask[i] = OpMask[OpIdx];
40962 continue;
40963 }
40964
40965 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40966 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
40967 : (OpMask[OpIdx] << OpRatioLog2) +
40968 (RootMaskedIdx & (OpRatio - 1));
40969
40970 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
40971 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
40972 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
40973 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
40974
40975 Mask[i] = OpMaskedIdx;
40976 }
40977 }
40978
40979 // Peek through vector widenings and set out of bounds mask indices to undef.
40980 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
40981 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
40982 SDValue &Op = Ops[I];
40983 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
40984 isNullConstant(Op.getOperand(2))) {
40985 Op = Op.getOperand(1);
40986 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
40987 int Lo = I * Mask.size();
40988 int Hi = (I + 1) * Mask.size();
40989 int NewHi = Lo + (Mask.size() / Scale);
40990 for (int &M : Mask) {
40991 if (Lo <= M && NewHi <= M && M < Hi)
40992 M = SM_SentinelUndef;
40993 }
40994 }
40995 }
40996
40997 // Peek through any free extract_subvector nodes back to root size.
40998 for (SDValue &Op : Ops)
40999 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41000 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41001 isNullConstant(Op.getOperand(1)))
41002 Op = Op.getOperand(0);
41003
41004 // Remove unused/repeated shuffle source ops.
41005 resolveTargetShuffleInputsAndMask(Ops, Mask);
41006
41007 // Handle the all undef/zero/ones cases early.
41008 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41009 return DAG.getUNDEF(RootVT);
41010 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41011 return getZeroVector(RootVT, Subtarget, DAG, DL);
41012 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41014 return getOnesVector(RootVT, DAG, DL);
41015
41016 assert(!Ops.empty() && "Shuffle with no inputs detected");
41017 HasVariableMask |= IsOpVariableMask;
41018
41019 // Update the list of shuffle nodes that have been combined so far.
41020 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41021 CombinedNodes.push_back(Op.getNode());
41022
41023 // See if we can recurse into each shuffle source op (if it's a target
41024 // shuffle). The source op should only be generally combined if it either has
41025 // a single use (i.e. current Op) or all its users have already been combined.
41026 // If not, we can still combine but should prevent generation of variable
41027 // shuffles to avoid constant pool bloat.
41028 // Don't recurse if we already have more source ops than we can combine in
41029 // the remaining recursion depth.
41030 if (Ops.size() < (MaxDepth - Depth)) {
41031 for (int i = 0, e = Ops.size(); i < e; ++i) {
41032 // For empty roots, we need to resolve zeroable elements before combining
41033 // them with other shuffles.
41034 SmallVector<int, 64> ResolvedMask = Mask;
41035 if (EmptyRoot)
41036 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41037 bool AllowCrossLaneVar = false;
41038 bool AllowPerLaneVar = false;
41039 if (Ops[i].getNode()->hasOneUse() ||
41040 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41041 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41042 AllowPerLaneVar = AllowVariablePerLaneMask;
41043 }
41044 if (SDValue Res = combineX86ShufflesRecursively(
41045 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41046 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41047 Subtarget))
41048 return Res;
41049 }
41050 }
41051
41052 // Attempt to constant fold all of the constant source ops.
41053 if (SDValue Cst = combineX86ShufflesConstants(
41054 RootVT, Ops, Mask, HasVariableMask, DAG, DL, Subtarget))
41055 return Cst;
41056
41057 // If constant folding failed and we only have constants - then we have
41058 // multiple uses by a single non-variable shuffle - just bail.
41059 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41060 APInt UndefElts;
41061 SmallVector<APInt> RawBits;
41062 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41063 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41064 RawBits,
41065 /*AllowWholeUndefs*/ true,
41066 /*AllowPartialUndefs*/ true);
41067 })) {
41068 return SDValue();
41069 }
41070
41071 // Canonicalize the combined shuffle mask chain with horizontal ops.
41072 // NOTE: This will update the Ops and Mask.
41073 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41074 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41075 return DAG.getBitcast(RootVT, HOp);
41076
41077 // Try to refine our inputs given our knowledge of target shuffle mask.
41078 for (auto I : enumerate(Ops)) {
41079 int OpIdx = I.index();
41080 SDValue &Op = I.value();
41081
41082 // What range of shuffle mask element values results in picking from Op?
41083 int Lo = OpIdx * Mask.size();
41084 int Hi = Lo + Mask.size();
41085
41086 // Which elements of Op do we demand, given the mask's granularity?
41087 APInt OpDemandedElts(Mask.size(), 0);
41088 for (int MaskElt : Mask) {
41089 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41090 int OpEltIdx = MaskElt - Lo;
41091 OpDemandedElts.setBit(OpEltIdx);
41092 }
41093 }
41094
41095 // Is the shuffle result smaller than the root?
41096 if (Op.getValueSizeInBits() < RootSizeInBits) {
41097 // We padded the mask with undefs. But we now need to undo that.
41098 unsigned NumExpectedVectorElts = Mask.size();
41099 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41100 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41101 assert(!OpDemandedElts.extractBits(
41102 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41103 "Demanding the virtual undef widening padding?");
41104 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41105 }
41106
41107 // The Op itself may be of different VT, so we need to scale the mask.
41108 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41109 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41110
41111 // Can this operand be simplified any further, given its demanded elements?
41112 if (SDValue NewOp =
41113 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41114 Op, OpScaledDemandedElts, DAG))
41115 Op = NewOp;
41116 }
41117 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41118
41119 // Widen any subvector shuffle inputs we've collected.
41120 // TODO: Remove this to avoid generating temporary nodes; we should only
41121 // widen once combineX86ShuffleChain has found a match.
41122 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41123 return Op.getValueSizeInBits() < RootSizeInBits;
41124 })) {
41125 for (SDValue &Op : Ops)
41126 if (Op.getValueSizeInBits() < RootSizeInBits)
41127 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41128 RootSizeInBits);
41129 // Reresolve - we might have repeated subvector sources.
41130 resolveTargetShuffleInputsAndMask(Ops, Mask);
41131 }
41132
41133 // We can only combine unary and binary shuffle mask cases.
41134 if (Ops.size() <= 2) {
41135 // Minor canonicalization of the accumulated shuffle mask to make it easier
41136 // to match below. All this does is detect masks with sequential pairs of
41137 // elements, and shrink them to the half-width mask. It does this in a loop
41138 // so it will reduce the size of the mask to the minimal width mask which
41139 // performs an equivalent shuffle.
41140 while (Mask.size() > 1) {
41141 SmallVector<int, 64> WidenedMask;
41142 if (!canWidenShuffleElements(Mask, WidenedMask))
41143 break;
41144 Mask = std::move(WidenedMask);
41145 }
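// Worked example (illustrative): a v8i16-style mask <0,1,4,5,2,3,6,7> pairs up
// into the v4i32-style mask <0,2,1,3>; that mask has no sequential pairs left,
// so the loop stops there with the minimal equivalent width.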
41146
41147 // Canonicalization of binary shuffle masks to improve pattern matching by
41148 // commuting the inputs.
41149 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41150 ShuffleVectorSDNode::commuteMask(Mask);
41151 std::swap(Ops[0], Ops[1]);
41152 }
41153
41154 // Try to combine into a single shuffle instruction.
41155 if (SDValue Shuffle = combineX86ShuffleChain(
41156 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41157 AllowVariablePerLaneMask, DAG, Subtarget))
41158 return Shuffle;
41159
41160 // If all the operands come from the same larger vector, fallthrough and try
41161 // to use combineX86ShuffleChainWithExtract.
41162 SDValue LHS = peekThroughBitcasts(Ops.front());
41163 SDValue RHS = peekThroughBitcasts(Ops.back());
41164 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41165 (RootSizeInBits / Mask.size()) != 64 ||
41166 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41167 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41168 LHS.getOperand(0) != RHS.getOperand(0))
41169 return SDValue();
41170 }
41171
41172 // If that failed and any input is extracted then try to combine as a
41173 // shuffle with the larger type.
41174 return combineX86ShuffleChainWithExtract(
41175 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41176 AllowVariablePerLaneMask, DAG, Subtarget);
41177}
41178
41179/// Helper entry wrapper to combineX86ShufflesRecursively.
41180 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41181 const X86Subtarget &Subtarget) {
41182 return combineX86ShufflesRecursively(
41183 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41184 /*HasVarMask*/ false,
41185 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41186 Subtarget);
41187}
41188
41189/// Get the PSHUF-style mask from PSHUF node.
41190///
41191 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41192/// PSHUF-style masks that can be reused with such instructions.
41193 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41194 MVT VT = N.getSimpleValueType();
41195 SmallVector<int, 4> Mask;
41196 SmallVector<SDValue, 2> Ops;
41197 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41198 (void)HaveMask;
41199 assert(HaveMask);
41200
41201 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
41202 // matter. Check that the upper masks are repeats and remove them.
41203 if (VT.getSizeInBits() > 128) {
41204 int LaneElts = 128 / VT.getScalarSizeInBits();
41205#ifndef NDEBUG
41206 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41207 for (int j = 0; j < LaneElts; ++j)
41208 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41209 "Mask doesn't repeat in high 128-bit lanes!");
41210#endif
41211 Mask.resize(LaneElts);
41212 }
41213
41214 switch (N.getOpcode()) {
41215 case X86ISD::PSHUFD:
41216 return Mask;
41217 case X86ISD::PSHUFLW:
41218 Mask.resize(4);
41219 return Mask;
41220 case X86ISD::PSHUFHW:
41221 Mask.erase(Mask.begin(), Mask.begin() + 4);
41222 for (int &M : Mask)
41223 M -= 4;
41224 return Mask;
41225 default:
41226 llvm_unreachable("No valid shuffle instruction found!");
41227 }
41228}
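// Editorial sketch (assumed helper, not part of this file): the v4 PSHUF-style
// immediates these nodes carry pack four 2-bit lane indices, element I taken
// from bits [2*I+1 : 2*I]. Decoding one back into a mask looks like:
[[maybe_unused]] static void decodeV4ShuffleImm8Sketch(unsigned Imm8,
                                                       SmallVectorImpl<int> &Mask) {
  Mask.clear();
  for (unsigned I = 0; I != 4; ++I)
    Mask.push_back((Imm8 >> (2 * I)) & 0x3);
}
// E.g. Imm8 == 0x1B (0b00'01'10'11) decodes to the reversal mask <3,2,1,0>,
// the inverse of what getV4X86ShuffleImm8ForMask() builds further below.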
41229
41230/// Search for a combinable shuffle across a chain ending in pshufd.
41231///
41232/// We walk up the chain and look for a combinable shuffle, skipping over
41233/// shuffles that we could hoist this shuffle's transformation past without
41234/// altering anything.
41235 static SDValue combineRedundantDwordShuffle(SDValue N,
41236 MutableArrayRef<int> Mask,
41237 const SDLoc &DL,
41238 SelectionDAG &DAG) {
41239 assert(N.getOpcode() == X86ISD::PSHUFD &&
41240 "Called with something other than an x86 128-bit half shuffle!");
41241
41242 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41243 // of the shuffles in the chain so that we can form a fresh chain to replace
41244 // this one.
41245 SmallVector<SDValue, 8> Chain;
41246 SDValue V = N.getOperand(0);
41247 for (; V.hasOneUse(); V = V.getOperand(0)) {
41248 switch (V.getOpcode()) {
41249 default:
41250 return SDValue(); // Nothing combined!
41251
41252 case ISD::BITCAST:
41253 // Skip bitcasts as we always know the type for the target specific
41254 // instructions.
41255 continue;
41256
41257 case X86ISD::PSHUFD:
41258 // Found another dword shuffle.
41259 break;
41260
41261 case X86ISD::PSHUFLW:
41262 // Check that the low words (being shuffled) are the identity in the
41263 // dword shuffle, and the high words are self-contained.
41264 if (Mask[0] != 0 || Mask[1] != 1 ||
41265 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41266 return SDValue();
41267
41268 Chain.push_back(V);
41269 continue;
41270
41271 case X86ISD::PSHUFHW:
41272 // Check that the high words (being shuffled) are the identity in the
41273 // dword shuffle, and the low words are self-contained.
41274 if (Mask[2] != 2 || Mask[3] != 3 ||
41275 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41276 return SDValue();
41277
41278 Chain.push_back(V);
41279 continue;
41280
41281 case X86ISD::UNPCKL:
41282 case X86ISD::UNPCKH:
41283 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41284 // shuffle into a preceding word shuffle.
41285 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41286 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41287 return SDValue();
41288
41289 // Search for a half-shuffle which we can combine with.
41290 unsigned CombineOp =
41291 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41292 if (V.getOperand(0) != V.getOperand(1) ||
41293 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41294 return SDValue();
41295 Chain.push_back(V);
41296 V = V.getOperand(0);
41297 do {
41298 switch (V.getOpcode()) {
41299 default:
41300 return SDValue(); // Nothing to combine.
41301
41302 case X86ISD::PSHUFLW:
41303 case X86ISD::PSHUFHW:
41304 if (V.getOpcode() == CombineOp)
41305 break;
41306
41307 Chain.push_back(V);
41308
41309 [[fallthrough]];
41310 case ISD::BITCAST:
41311 V = V.getOperand(0);
41312 continue;
41313 }
41314 break;
41315 } while (V.hasOneUse());
41316 break;
41317 }
41318 // Break out of the loop if we break out of the switch.
41319 break;
41320 }
41321
41322 if (!V.hasOneUse())
41323 // We fell out of the loop without finding a viable combining instruction.
41324 return SDValue();
41325
41326 // Merge this node's mask and our incoming mask.
41327 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41328 for (int &M : Mask)
41329 M = VMask[M];
41330 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41331 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41332
41333 // Rebuild the chain around this new shuffle.
41334 while (!Chain.empty()) {
41335 SDValue W = Chain.pop_back_val();
41336
41337 if (V.getValueType() != W.getOperand(0).getValueType())
41338 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41339
41340 switch (W.getOpcode()) {
41341 default:
41342 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41343
41344 case X86ISD::UNPCKL:
41345 case X86ISD::UNPCKH:
41346 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41347 break;
41348
41349 case X86ISD::PSHUFD:
41350 case X86ISD::PSHUFLW:
41351 case X86ISD::PSHUFHW:
41352 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41353 break;
41354 }
41355 }
41356 if (V.getValueType() != N.getValueType())
41357 V = DAG.getBitcast(N.getValueType(), V);
41358
41359 // Return the new chain to replace N.
41360 return V;
41361}
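// Editorial sketch (assumed helper, not in the original file): the merge step
// above composes the incoming dword mask with the combinable shuffle's own
// mask, element I of the result reading VMask[Mask[I]]. In isolation, for
// plain sentinel-free 4-element masks:
[[maybe_unused]] static void composeDwordMasksSketch(MutableArrayRef<int> Mask,
                                                     ArrayRef<int> VMask) {
  for (int &M : Mask)
    M = VMask[M]; // e.g. Mask=<1,0,3,2>, VMask=<2,3,0,1> -> Mask=<3,2,1,0>.
}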
41362
41363// Attempt to commute shufps LHS loads:
41364// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41365 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41366 SelectionDAG &DAG) {
41367 // TODO: Add vXf64 support.
41368 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41369 return SDValue();
41370
41371 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41372 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41373 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41374 return SDValue();
41375 SDValue N0 = V.getOperand(0);
41376 SDValue N1 = V.getOperand(1);
41377 unsigned Imm = V.getConstantOperandVal(2);
41378 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41379 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41380 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41381 return SDValue();
41382 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41383 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41384 DAG.getTargetConstant(Imm, DL, MVT::i8));
41385 };
41386
41387 switch (N.getOpcode()) {
41388 case X86ISD::VPERMILPI:
41389 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41390 unsigned Imm = N.getConstantOperandVal(1);
41391 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41392 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41393 }
41394 break;
41395 case X86ISD::SHUFP: {
41396 SDValue N0 = N.getOperand(0);
41397 SDValue N1 = N.getOperand(1);
41398 unsigned Imm = N.getConstantOperandVal(2);
41399 if (N0 == N1) {
41400 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41401 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41402 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41403 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41404 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41405 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41406 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41407 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41408 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41409 }
41410 break;
41411 }
41412 }
41413
41414 return SDValue();
41415}
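// Worked example (illustrative): shufps(A,B,0xE4) selects [A0,A1,B2,B3].
// Commuting to shufps(B,A) with the nibble-swapped immediate 0x4E selects
// [B2,B3,A0,A1], i.e. the same data with its 64-bit halves exchanged; the
// enclosing VPERMILPI/SHUFP immediate is therefore XORed with 0xAA (or the
// matching nibble) to flip bit 1 of each 2-bit selector and read from the
// opposite half, leaving the overall result unchanged.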
41416
41417// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41418// iff we don't demand the same element index for both X and Y.
41419static SDValue
41420 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41421 const APInt &DemandedElts, SelectionDAG &DAG,
41422 const X86Subtarget &Subtarget, const SDLoc &DL) {
41423 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41424 if (!N0.hasOneUse() || !N1.hasOneUse())
41425 return SDValue();
41426
41427 unsigned NumElts = VT.getVectorNumElements();
41428 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41429 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41430
41431 // See if both operands are shuffles, and that we can scale the shuffle masks
41432 // to the same width as the blend mask.
41433 // TODO: Support SM_SentinelZero?
41434 SmallVector<SDValue, 2> Ops0, Ops1;
41435 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41436 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41437 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41438 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41439 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41440 return SDValue();
41441
41442 // Determine the demanded elts from both permutes.
41443 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41444 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41445 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41446 Demanded1,
41447 /*AllowUndefElts=*/true) ||
41448 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41449 DemandedRHS0, /*AllowUndefElts=*/true) ||
41450 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41451 DemandedRHS1, /*AllowUndefElts=*/true))
41452 return SDValue();
41453
41454 // Confirm that we only use a single operand from both permutes and that we
41455 // don't demand the same index from both.
41456 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41457 DemandedLHS0.intersects(DemandedLHS1))
41458 return SDValue();
41459
41460 // Use the permute demanded elts masks as the new blend mask.
41461 // Create the new permute mask as a blend of the 2 original permute masks.
41462 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41463 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41464 for (unsigned I = 0; I != NumElts; ++I) {
41465 if (Demanded0[I]) {
41466 int M = ScaledMask0[I];
41467 if (0 <= M) {
41468 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41469 "BlendMask demands LHS AND RHS");
41470 NewBlendMask[M] = M;
41471 NewPermuteMask[I] = M;
41472 }
41473 } else if (Demanded1[I]) {
41474 int M = ScaledMask1[I];
41475 if (0 <= M) {
41476 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41477 "BlendMask demands LHS AND RHS");
41478 NewBlendMask[M] = M + NumElts;
41479 NewPermuteMask[I] = M;
41480 }
41481 }
41482 }
41483 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41484 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41485
41486 // v16i16 shuffles can explode in complexity very easily; only accept them if
41487 // the blend mask is the same in the 128-bit subvectors (or can widen to
41488 // v8i32) and the permute can be widened as well.
41489 if (VT == MVT::v16i16) {
41490 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41491 !canWidenShuffleElements(NewBlendMask))
41492 return SDValue();
41493 if (!canWidenShuffleElements(NewPermuteMask))
41494 return SDValue();
41495 }
41496
41497 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41498 // widened to a lane permute (vperm2f128).
41499 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41501 NewPermuteMask) &&
41502 !canScaleShuffleElements(NewPermuteMask, 2))
41503 return SDValue();
41504
41505 SDValue NewBlend =
41506 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
41507 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
41508 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
41509 NewPermuteMask);
41510}
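// Worked example (illustrative): for v4i32 with X' = permute(X,<2,3,0,1>),
// Y' = permute(Y,<2,3,0,1>) and blend mask <0,5,2,7>, the result is
// [X2,Y3,X0,Y1]. The permutes demand disjoint indices ({0,2} from X, {1,3}
// from Y), so this rewrites to NewBlend = blend(X,Y,<0,5,2,7>) = [X0,Y1,X2,Y3]
// followed by the single permute <2,3,0,1>, giving the same [X2,Y3,X0,Y1].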
41511
41512// TODO - move this to TLI like isBinOp?
41513static bool isUnaryOp(unsigned Opcode) {
41514 switch (Opcode) {
41515 case ISD::CTLZ:
41516 case ISD::CTTZ:
41517 case ISD::CTPOP:
41518 return true;
41519 }
41520 return false;
41521}
41522
41523// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41524// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41525 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
41526 const SDLoc &DL) {
41527 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41528 EVT ShuffleVT = N.getValueType();
41529 unsigned Opc = N.getOpcode();
41530
41531 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
41532 bool FoldLoad = false) {
41533 // AllZeros/AllOnes constants are freely shuffled and will peek through
41534 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41535 // merge with target shuffles if it has one use so shuffle combining is
41536 // likely to kick in. Shuffles of splats are expected to be removed.
41537 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41538 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41539 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41540 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41541 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
41542 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
41543 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41544 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41545 (FoldLoad && isShuffleFoldableLoad(Op)) ||
41546 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41547 };
41548 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41549 // Ensure we only shuffle whole vector src elements, unless it's a logical
41550 // binop where we can more aggressively move shuffles from dst to src.
41551 return isLogicOp(BinOp) ||
41552 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41553 };
41554
41555 switch (Opc) {
41556 // Unary and Unary+Permute Shuffles.
41557 case X86ISD::PSHUFB: {
41558 // Don't merge PSHUFB if it contains zero'd elements.
41559 SmallVector<int> Mask;
41560 SmallVector<SDValue> Ops;
41561 if (!getTargetShuffleMask(N, false, Ops, Mask))
41562 break;
41563 [[fallthrough]];
41564 }
41565 case X86ISD::VBROADCAST:
41566 case X86ISD::MOVDDUP:
41567 case X86ISD::PSHUFD:
41568 case X86ISD::PSHUFHW:
41569 case X86ISD::PSHUFLW:
41570 case X86ISD::VPERMI:
41571 case X86ISD::VPERMILPI: {
41572 if (N.getOperand(0).getValueType() == ShuffleVT &&
41573 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41574 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41575 unsigned SrcOpcode = N0.getOpcode();
41576 EVT OpVT = N0.getValueType();
41577 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41578 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41579 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41580 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
41581 Opc != X86ISD::PSHUFB) ||
41582 IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
41583 Opc != X86ISD::PSHUFB)) {
41584 SDValue LHS, RHS;
41585 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41586 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41587 if (N.getNumOperands() == 2) {
41588 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41589 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41590 } else {
41591 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41592 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41593 }
41594 return DAG.getBitcast(ShuffleVT,
41595 DAG.getNode(SrcOpcode, DL, OpVT,
41596 DAG.getBitcast(OpVT, LHS),
41597 DAG.getBitcast(OpVT, RHS)));
41598 }
41599 }
41600 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
41601 OpVT.getScalarSizeInBits() ==
41602 N0.getOperand(0).getScalarValueSizeInBits()) {
41603 SDValue Op00 = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
41604 SDValue Res =
41605 N.getNumOperands() == 2
41606 ? DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1))
41607 : DAG.getNode(Opc, DL, ShuffleVT, Op00);
41608 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
41609 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
41610 }
41611 }
41612 break;
41613 }
41614 // Binary and Binary+Permute Shuffles.
41615 case X86ISD::INSERTPS: {
41616 // Don't merge INSERTPS if it contains zero'd elements.
41617 unsigned InsertPSMask = N.getConstantOperandVal(2);
41618 unsigned ZeroMask = InsertPSMask & 0xF;
41619 if (ZeroMask != 0)
41620 break;
41621 [[fallthrough]];
41622 }
41623 case X86ISD::MOVSD:
41624 case X86ISD::MOVSS:
41625 case X86ISD::BLENDI:
41626 case X86ISD::SHUFP:
41627 case X86ISD::UNPCKH:
41628 case X86ISD::UNPCKL: {
41629 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41630 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41631 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41632 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41633 unsigned SrcOpcode = N0.getOpcode();
41634 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41635 N0.getValueType() == N1.getValueType() &&
41636 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41637 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41638 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41639 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41640 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41641 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41642 // Ensure the total number of shuffles doesn't increase by folding this
41643 // shuffle through to the source ops.
41644 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41645 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41646 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41647 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41648 SDValue LHS, RHS;
41649 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41650 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41651 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41652 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41653 if (N.getNumOperands() == 3) {
41654 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41655 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41656 } else {
41657 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41658 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41659 }
41660 EVT OpVT = N0.getValueType();
41661 return DAG.getBitcast(ShuffleVT,
41662 DAG.getNode(SrcOpcode, DL, OpVT,
41663 DAG.getBitcast(OpVT, LHS),
41664 DAG.getBitcast(OpVT, RHS)));
41665 }
41666 }
41667 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41668 N0.getValueType() == N1.getValueType() &&
41669 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41670 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41671 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41672 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41673 SDValue Res;
41674 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41675 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41676 if (N.getNumOperands() == 3) {
41677 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41678 } else {
41679 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41680 }
41681 EVT OpVT = N0.getValueType();
41682 return DAG.getBitcast(
41683 ShuffleVT,
41684 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
41685 }
41686 // TODO: We can generalize this for other shuffles/conversions.
41687 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
41688 N1.getOpcode() == SrcOpcode &&
41689 N0.getValueType() == N1.getValueType() &&
41690 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
41691 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
41692 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41693 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41694 EVT OpSrcVT = N0.getOperand(0).getValueType();
41695 EVT OpDstVT = N0.getValueType();
41696 SDValue Res =
41697 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
41698 return DAG.getBitcast(ShuffleVT,
41699 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
41700 }
41701 }
41702 break;
41703 }
41704 }
41705 return SDValue();
41706}
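// Worked example (illustrative): PSHUFD(AND(X, SplatC)) becomes
// AND(PSHUFD(X), PSHUFD(SplatC)); the shuffle of the splat/constant operand
// folds away, so the net effect is to move the shuffle past the logic op,
// where it can merge with shuffles feeding X.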
41707
41708/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41709 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41710 SelectionDAG &DAG,
41711 const SDLoc &DL) {
41712 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41713
41714 MVT VT = V.getSimpleValueType();
41715 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41716 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41717 unsigned SrcOpc0 = Src0.getOpcode();
41718 unsigned SrcOpc1 = Src1.getOpcode();
41719 EVT SrcVT0 = Src0.getValueType();
41720 EVT SrcVT1 = Src1.getValueType();
41721
41722 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41723 return SDValue();
41724
41725 switch (SrcOpc0) {
41726 case X86ISD::MOVDDUP: {
41727 SDValue LHS = Src0.getOperand(0);
41728 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41729 SDValue Res =
41730 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41731 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41732 return DAG.getBitcast(VT, Res);
41733 }
41734 case X86ISD::VPERMILPI:
41735 // TODO: Handle v4f64 permutes with different low/high lane masks.
41736 if (SrcVT0 == MVT::v4f64) {
41737 uint64_t Mask = Src0.getConstantOperandVal(1);
41738 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41739 break;
41740 }
41741 [[fallthrough]];
41742 case X86ISD::VSHLI:
41743 case X86ISD::VSRLI:
41744 case X86ISD::VSRAI:
41745 case X86ISD::PSHUFD:
41746 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41747 SDValue LHS = Src0.getOperand(0);
41748 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41749 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41750 V.getOperand(2));
41751 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41752 return DAG.getBitcast(VT, Res);
41753 }
41754 break;
41755 }
41756
41757 return SDValue();
41758}
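// Worked example (illustrative): vperm2x128(pshufd(X,m), pshufd(Y,m), sel)
// becomes pshufd(vperm2x128(X,Y,sel), m) - both lanes of each source were
// already permuted identically, so one per-lane shuffle after the lane select
// is equivalent and removes a node.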
41759
41760static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
41761 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
41762 TargetLowering::DAGCombinerInfo &DCI,
41763 const X86Subtarget &Subtarget);
41764
41765/// Try to combine x86 target specific shuffles.
41766 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
41767 SelectionDAG &DAG,
41768 TargetLowering::DAGCombinerInfo &DCI,
41769 const X86Subtarget &Subtarget) {
41770 using namespace SDPatternMatch;
41771
41772 MVT VT = N.getSimpleValueType();
41773 unsigned NumElts = VT.getVectorNumElements();
41775 unsigned Opcode = N.getOpcode();
41776 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41777
41778 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41779 return R;
41780
41781 // Handle specific target shuffles.
41782 switch (Opcode) {
41783 case X86ISD::MOVDDUP: {
41784 SDValue Src = N.getOperand(0);
41785 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41786 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41787 ISD::isNormalLoad(Src.getNode())) {
41788 LoadSDNode *LN = cast<LoadSDNode>(Src);
41789 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41790 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41791 DCI.CombineTo(N.getNode(), Movddup);
41792 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41793 DCI.recursivelyDeleteUnusedNodes(LN);
41794 return N; // Return N so it doesn't get rechecked!
41795 }
41796 }
41797
41798 return SDValue();
41799 }
41800 case X86ISD::VBROADCAST: {
41801 SDValue Src = N.getOperand(0);
41802 SDValue BC = peekThroughBitcasts(Src);
41803 EVT SrcVT = Src.getValueType();
41804 EVT BCVT = BC.getValueType();
41805
41806 // If broadcasting from another shuffle, attempt to simplify it.
41807 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41808 if (isTargetShuffle(BC.getOpcode()) &&
41809 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41810 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41811 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41812 SM_SentinelUndef);
41813 for (unsigned i = 0; i != Scale; ++i)
41814 DemandedMask[i] = i;
41815 if (SDValue Res = combineX86ShufflesRecursively(
41816 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41817 X86::MaxShuffleCombineDepth,
41818 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41819 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41820 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41821 DAG.getBitcast(SrcVT, Res));
41822 }
41823
41824 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41825 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41826 if (Src.getOpcode() == ISD::BITCAST &&
41827 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41828 TLI.isTypeLegal(BCVT) &&
41830 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41831 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41832 VT.getVectorNumElements());
41833 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41834 }
41835
41836 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41837 // If we're re-broadcasting a smaller type then broadcast with that type and
41838 // bitcast.
41839 // TODO: Do this for any splat?
41840 if (Src.getOpcode() == ISD::BITCAST &&
41841 (BC.getOpcode() == X86ISD::VBROADCAST ||
41842 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41843 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41844 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41845 MVT NewVT =
41847 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41848 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41849 }
41850
41851 // Reduce broadcast source vector to lowest 128-bits.
41852 if (SrcVT.getSizeInBits() > 128)
41853 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41854 extract128BitVector(Src, 0, DAG, DL));
41855
41856 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41857 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41858 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
41859 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41860
41861 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41862 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41863 isNullConstant(Src.getOperand(1)) &&
41864 Src.getValueType() ==
41865 Src.getOperand(0).getValueType().getScalarType() &&
41866 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
41867 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41868
41869 // Share broadcast with the longest vector and extract low subvector (free).
41870 // Ensure the same SDValue from the SDNode use is being used.
41871 for (SDNode *User : Src->users())
41872 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41873 Src == User->getOperand(0) &&
41874 User->getValueSizeInBits(0).getFixedValue() >
41875 VT.getFixedSizeInBits()) {
41876 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41877 VT.getSizeInBits());
41878 }
41879
41880 // vbroadcast(scalarload X) -> vbroadcast_load X
41881 // For float loads, extract other uses of the scalar from the broadcast.
41882 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41883 ISD::isNormalLoad(Src.getNode())) {
41884 LoadSDNode *LN = cast<LoadSDNode>(Src);
41885 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41886 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41887 SDValue BcastLd =
41888 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41889 LN->getMemoryVT(), LN->getMemOperand());
41890 // If the load value is used only by N, replace it via CombineTo N.
41891 bool NoReplaceExtract = Src.hasOneUse();
41892 DCI.CombineTo(N.getNode(), BcastLd);
41893 if (NoReplaceExtract) {
41894 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41895 DCI.recursivelyDeleteUnusedNodes(LN);
41896 } else {
41897 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41898 DAG.getVectorIdxConstant(0, DL));
41899 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41900 }
41901 return N; // Return N so it doesn't get rechecked!
41902 }
41903
41904 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41905 // i16. So shrink it ourselves if we can make a broadcast_load.
41906 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41907 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41908 assert(Subtarget.hasAVX2() && "Expected AVX2");
41909 SDValue TruncIn = Src.getOperand(0);
41910
41911 // If this is a truncate of a non extending load we can just narrow it to
41912 // use a broadcast_load.
41913 if (ISD::isNormalLoad(TruncIn.getNode())) {
41914 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41915 // Unless its volatile or atomic.
41916 if (LN->isSimple()) {
41917 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41918 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41919 SDValue BcastLd = DAG.getMemIntrinsicNode(
41920 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41921 LN->getPointerInfo(), LN->getOriginalAlign(),
41922 LN->getMemOperand()->getFlags());
41923 DCI.CombineTo(N.getNode(), BcastLd);
41924 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41925 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41926 return N; // Return N so it doesn't get rechecked!
41927 }
41928 }
41929
41930 // If this is a truncate of an i16 extload, we can directly replace it.
41931 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41932 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41933 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41934 if (LN->getMemoryVT().getSizeInBits() == 16) {
41935 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41936 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41937 SDValue BcastLd =
41938 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41939 LN->getMemoryVT(), LN->getMemOperand());
41940 DCI.CombineTo(N.getNode(), BcastLd);
41941 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41942 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41943 return N; // Return N so it doesn't get rechecked!
41944 }
41945 }
41946
41947 // If this is a truncate of load that has been shifted right, we can
41948 // offset the pointer and use a narrower load.
41949 if (TruncIn.getOpcode() == ISD::SRL &&
41950 TruncIn.getOperand(0).hasOneUse() &&
41951 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
41952 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
41953 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
41954 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
41955 // Make sure the shift amount and the load size are divisible by 16.
41956 // Don't do this if the load is volatile or atomic.
41957 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
41958 LN->isSimple()) {
41959 unsigned Offset = ShiftAmt / 8;
41960 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41963 SDValue Ops[] = { LN->getChain(), Ptr };
41964 SDValue BcastLd = DAG.getMemIntrinsicNode(
41965 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41966 LN->getPointerInfo().getWithOffset(Offset),
41967 LN->getOriginalAlign(),
41968 LN->getMemOperand()->getFlags());
41969 DCI.CombineTo(N.getNode(), BcastLd);
41970 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41971 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41972 return N; // Return N so it doesn't get rechecked!
41973 }
41974 }
41975 }
41976
41977 // vbroadcast(vzload X) -> vbroadcast_load X
41978 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
41979 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
41980 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41981 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41982 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41983 SDValue BcastLd =
41984 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41985 LN->getMemoryVT(), LN->getMemOperand());
41986 DCI.CombineTo(N.getNode(), BcastLd);
41987 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41988 DCI.recursivelyDeleteUnusedNodes(LN);
41989 return N; // Return N so it doesn't get rechecked!
41990 }
41991 }
41992
41993 // vbroadcast(vector load X) -> vbroadcast_load
41994 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
41995 SrcVT == MVT::v4i32) &&
41996 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
41997 LoadSDNode *LN = cast<LoadSDNode>(Src);
41998 // Unless the load is volatile or atomic.
41999 if (LN->isSimple()) {
42000 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42001 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42002 SDValue BcastLd = DAG.getMemIntrinsicNode(
42003 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42004 LN->getPointerInfo(), LN->getOriginalAlign(),
42005 LN->getMemOperand()->getFlags());
42006 DCI.CombineTo(N.getNode(), BcastLd);
42007 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42008 DCI.recursivelyDeleteUnusedNodes(LN);
42009 return N; // Return N so it doesn't get rechecked!
42010 }
42011 }
42012
42013 return SDValue();
42014 }
42015 case X86ISD::VZEXT_MOVL: {
42016 SDValue N0 = N.getOperand(0);
42017
42018 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42019 // the load is volatile.
42020 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42021 auto *LN = cast<LoadSDNode>(N0);
42022 if (SDValue VZLoad =
42023 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42024 DCI.CombineTo(N.getNode(), VZLoad);
42025 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42026 DCI.recursivelyDeleteUnusedNodes(LN);
42027 return N;
42028 }
42029 }
42030
42031 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42032 // and can just use a VZEXT_LOAD.
42033 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42034 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42035 auto *LN = cast<MemSDNode>(N0);
42036 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42037 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42038 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42039 SDValue VZLoad =
42040 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42041 LN->getMemoryVT(), LN->getMemOperand());
42042 DCI.CombineTo(N.getNode(), VZLoad);
42043 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42044 DCI.recursivelyDeleteUnusedNodes(LN);
42045 return N;
42046 }
42047 }
42048
42049 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42050 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42051 // if the upper bits of the i64 are zero.
42052 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42053 N0.getOperand(0).hasOneUse() &&
42054 N0.getOperand(0).getValueType() == MVT::i64) {
42055 SDValue In = N0.getOperand(0);
42056 APInt Mask = APInt::getHighBitsSet(64, 32);
42057 if (DAG.MaskedValueIsZero(In, Mask)) {
42058 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42059 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42060 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42061 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42062 return DAG.getBitcast(VT, Movl);
42063 }
42064 }
42065
42066 // Load a scalar integer constant directly to XMM instead of transferring an
42067 // immediate value from GPR.
42068 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42069 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42070 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42071 // Create a vector constant - scalar constant followed by zeros.
42072 EVT ScalarVT = N0.getOperand(0).getValueType();
42073 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42074 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42075 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42076 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42077
42078 // Load the vector constant from constant pool.
42079 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42080 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42081 MachinePointerInfo MPI =
42082 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42083 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42084 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42085 MachineMemOperand::MOLoad);
42086 }
42087 }
42088
42089 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42090 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42091 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42092 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42093 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42094 SDValue V = peekThroughOneUseBitcasts(N0);
42095
42096 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42097 isNullConstant(V.getOperand(2))) {
42098 SDValue In = V.getOperand(1);
42099 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42100 In.getValueSizeInBits() /
42101 VT.getScalarSizeInBits());
42102 In = DAG.getBitcast(SubVT, In);
42103 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42104 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42105 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42106 V.getOperand(2));
42107 }
42108 }
42109
42110 return SDValue();
42111 }
42112 case X86ISD::BLENDI: {
42113 SDValue N0 = N.getOperand(0);
42114 SDValue N1 = N.getOperand(1);
42115 unsigned EltBits = VT.getScalarSizeInBits();
42116
42117 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42118 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42119 // TODO: Handle MVT::v16i16 repeated blend mask.
42120 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42121 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42122 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42123 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42124 unsigned NewSize = SrcVT.getVectorNumElements();
42125 APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(NumElts);
42126 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42127 return DAG.getBitcast(
42128 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42129 N1.getOperand(0),
42130 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42131 DL, MVT::i8)));
42132 }
42133 }
42134 // Share PSHUFB masks:
42135 // blend(pshufb(x,m1),pshufb(y,m2))
42136 // --> m3 = blend(m1,m2)
42137 // blend(pshufb(x,m3),pshufb(y,m3))
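// Worked example (illustrative): for a v4i32 blend with mask <0,5,2,7>, only
// bytes 0-3 and 8-11 of m1 and bytes 4-7 and 12-15 of m2 are observable, so a
// single shared byte mask m3 can be built by blending m1 and m2 with that same
// (byte-widened) mask, halving the number of shuffle mask constants needed.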
42138 if (N0.hasOneUse() && N1.hasOneUse()) {
42139 SmallVector<int> Mask, ByteMask;
42140 SmallVector<SDValue> Ops;
42141 SDValue LHS = peekThroughOneUseBitcasts(N0);
42142 SDValue RHS = peekThroughOneUseBitcasts(N1);
42143 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42144 RHS.getOpcode() == X86ISD::PSHUFB &&
42145 LHS.getOperand(1) != RHS.getOperand(1) &&
42146 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42147 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42148 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42149 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42150 "BLENDI decode mismatch");
42151 MVT ShufVT = LHS.getSimpleValueType();
42152 SDValue MaskLHS = LHS.getOperand(1);
42153 SDValue MaskRHS = RHS.getOperand(1);
42154 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42155 if (SDValue NewMask = combineX86ShufflesConstants(
42156 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42157 /*HasVariableMask=*/true, DAG, DL, Subtarget)) {
42158 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42159 LHS.getOperand(0), NewMask);
42160 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42161 RHS.getOperand(0), NewMask);
42162 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42163 DAG.getBitcast(VT, NewLHS),
42164 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42165 }
42166 }
42167 }
42168 }
42169 return SDValue();
42170 }
42171 case X86ISD::SHUFP: {
42172 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42173 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42174 // TODO: Support types other than v4f32.
42175 if (VT == MVT::v4f32) {
42176 bool Updated = false;
42177 SmallVector<int> Mask;
42178 SmallVector<SDValue> Ops;
42179 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42180 for (int i = 0; i != 2; ++i) {
42181 SmallVector<SDValue> SubOps;
42182 SmallVector<int> SubMask, SubScaledMask;
42183 SDValue Sub = peekThroughBitcasts(Ops[i]);
42184 // TODO: Scaling might be easier if we specify the demanded elts.
42185 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42186 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42187 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42188 int Ofs = i * 2;
42189 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42190 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42191 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42192 Updated = true;
42193 }
42194 }
42195 }
42196 if (Updated) {
42197 for (int &M : Mask)
42198 M %= 4;
42199 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42200 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42201 }
42202 }
42203 return SDValue();
42204 }
42205 case X86ISD::VPERMI: {
42206 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42207 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42208 SDValue N0 = N.getOperand(0);
42209 SDValue N1 = N.getOperand(1);
42210 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42211 if (N0.getOpcode() == ISD::BITCAST &&
42212 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42213 SDValue Src = N0.getOperand(0);
42214 EVT SrcVT = Src.getValueType();
42215 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42216 return DAG.getBitcast(VT, Res);
42217 }
42218 return SDValue();
42219 }
42220 case X86ISD::SHUF128: {
42221 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42222 // see if we can peek through and access the subvector directly.
42223 if (VT.is512BitVector()) {
42224 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
42225 // upper subvector is used.
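// e.g. if immediate bits 1 and 3 are both set, the two selectors for the
// first source only ever pick its upper two 128-bit lanes, i.e. the upper
// half of a concat(lo,hi) source, so 'hi' (suitably widened) can be used
// instead and those msb bits cleared.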
42226 SDValue LHS = N->getOperand(0);
42227 SDValue RHS = N->getOperand(1);
42228 uint64_t Mask = N->getConstantOperandVal(2);
42229 SmallVector<SDValue> LHSOps, RHSOps;
42230 SDValue NewLHS, NewRHS;
42231 if ((Mask & 0x0A) == 0x0A &&
42232 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42233 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42234 Mask &= ~0x0A;
42235 }
42236 if ((Mask & 0xA0) == 0xA0 &&
42237 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42238 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42239 Mask &= ~0xA0;
42240 }
42241 if (NewLHS || NewRHS)
42242 return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
42243 NewRHS ? NewRHS : RHS,
42244 DAG.getTargetConstant(Mask, DL, MVT::i8));
42245 }
42246 return SDValue();
42247 }
42248 case X86ISD::VPERM2X128: {
42249 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42250 SDValue LHS = N->getOperand(0);
42251 SDValue RHS = N->getOperand(1);
42252 if (LHS.getOpcode() == ISD::BITCAST &&
42253 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42254 EVT SrcVT = LHS.getOperand(0).getValueType();
42255 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42256 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42257 DAG.getBitcast(SrcVT, LHS),
42258 DAG.getBitcast(SrcVT, RHS),
42259 N->getOperand(2)));
42260 }
42261 }
42262
42263 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42265 return Res;
42266
42267 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42268 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
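// e.g. with sources concat(X,Y) and concat(Z,W), an immediate of 0x31 picks
// selector 1 (Y) for the low result half and selector 3 (W) for the high
// half, which is simply concat(Y,W).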
42269 auto FindSubVector128 = [&](unsigned Idx) {
42270 if (Idx > 3)
42271 return SDValue();
42272 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42273 SmallVector<SDValue> SubOps;
42274 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42275 return SubOps[Idx & 1];
42276 unsigned NumElts = Src.getValueType().getVectorNumElements();
42277 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42278 Src.getOperand(1).getValueSizeInBits() == 128 &&
42279 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42280 return Src.getOperand(1);
42281 }
42282 return SDValue();
42283 };
42284 unsigned Imm = N.getConstantOperandVal(2);
42285 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42286 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42287 MVT SubVT = VT.getHalfNumVectorElementsVT();
42288 SubLo = DAG.getBitcast(SubVT, SubLo);
42289 SubHi = DAG.getBitcast(SubVT, SubHi);
42290 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42291 }
42292 }
42293 return SDValue();
42294 }
42295 case X86ISD::PSHUFD:
42296 case X86ISD::PSHUFLW:
42297 case X86ISD::PSHUFHW: {
42298 SDValue N0 = N.getOperand(0);
42299 SDValue N1 = N.getOperand(1);
42300 if (N0->hasOneUse()) {
42302 switch (V.getOpcode()) {
42303 case X86ISD::VSHL:
42304 case X86ISD::VSRL:
42305 case X86ISD::VSRA:
42306 case X86ISD::VSHLI:
42307 case X86ISD::VSRLI:
42308 case X86ISD::VSRAI:
42309 case X86ISD::VROTLI:
42310 case X86ISD::VROTRI: {
42311 MVT InnerVT = V.getSimpleValueType();
42312 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42313 SDValue Res = DAG.getNode(Opcode, DL, VT,
42314 DAG.getBitcast(VT, V.getOperand(0)), N1);
42315 Res = DAG.getBitcast(InnerVT, Res);
42316 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42317 return DAG.getBitcast(VT, Res);
42318 }
42319 break;
42320 }
42321 }
42322 }
42323
42324 Mask = getPSHUFShuffleMask(N);
42325 assert(Mask.size() == 4);
42326 break;
42327 }
42328 case X86ISD::MOVSD:
42329 case X86ISD::MOVSH:
42330 case X86ISD::MOVSS: {
42331 SDValue N0 = N.getOperand(0);
42332 SDValue N1 = N.getOperand(1);
42333
42334 // Canonicalize scalar FPOps:
42335 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42336 // If commutable, allow OP(N1[0], N0[0]).
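// e.g. movss(x, fadd(x, y)) only keeps lane 0 of the fadd, so the add can be
// done on the extracted scalars and re-inserted, which maps directly onto
// ADDSS.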
42337 unsigned Opcode1 = N1.getOpcode();
42338 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42339 Opcode1 == ISD::FDIV) {
42340 SDValue N10 = N1.getOperand(0);
42341 SDValue N11 = N1.getOperand(1);
42342 if (N10 == N0 ||
42343 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42344 if (N10 != N0)
42345 std::swap(N10, N11);
42346 MVT SVT = VT.getVectorElementType();
42347 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42348 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42349 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42350 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42351 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42352 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42353 }
42354 }
42355
42356 return SDValue();
42357 }
42358 case X86ISD::INSERTPS: {
42359 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42360 SDValue Op0 = N.getOperand(0);
42361 SDValue Op1 = N.getOperand(1);
42362 unsigned InsertPSMask = N.getConstantOperandVal(2);
42363 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42364 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42365 unsigned ZeroMask = InsertPSMask & 0xF;
42366
42367 // If we zero out all elements from Op0 then we don't need to reference it.
42368 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42369 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42370 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42371
42372 // If we zero out the element from Op1 then we don't need to reference it.
42373 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42374 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42375 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42376
42377 // Attempt to merge insertps Op1 with an inner target shuffle node.
42378 SmallVector<int, 8> TargetMask1;
42380 APInt KnownUndef1, KnownZero1;
42381 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42382 KnownZero1)) {
42383 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42384 // Zero/UNDEF insertion - zero out element and remove dependency.
42385 InsertPSMask |= (1u << DstIdx);
42386 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42387 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42388 }
42389 // Update insertps mask srcidx and reference the source input directly.
42390 int M = TargetMask1[SrcIdx];
42391 assert(0 <= M && M < 8 && "Shuffle index out of range");
42392 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42393 Op1 = Ops1[M < 4 ? 0 : 1];
42394 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42395 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42396 }
42397
42398 // Attempt to merge insertps Op0 with an inner target shuffle node.
42399 SmallVector<int, 8> TargetMask0;
42401 APInt KnownUndef0, KnownZero0;
42402 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42403 KnownZero0)) {
42404 bool Updated = false;
42405 bool UseInput00 = false;
42406 bool UseInput01 = false;
42407 for (int i = 0; i != 4; ++i) {
42408 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42409 // No change if element is already zero or the inserted element.
42410 continue;
42411 }
42412
42413 if (KnownUndef0[i] || KnownZero0[i]) {
42414 // If the target mask is undef/zero then we must zero the element.
42415 InsertPSMask |= (1u << i);
42416 Updated = true;
42417 continue;
42418 }
42419
42420 // The input vector element must be inline.
42421 int M = TargetMask0[i];
42422 if (M != i && M != (i + 4))
42423 return SDValue();
42424
42425 // Determine which inputs of the target shuffle we're using.
42426 UseInput00 |= (0 <= M && M < 4);
42427 UseInput01 |= (4 <= M);
42428 }
42429
42430 // If we're not using both inputs of the target shuffle then use the
42431 // referenced input directly.
42432 if (UseInput00 && !UseInput01) {
42433 Updated = true;
42434 Op0 = Ops0[0];
42435 } else if (!UseInput00 && UseInput01) {
42436 Updated = true;
42437 Op0 = Ops0[1];
42438 }
42439
42440 if (Updated)
42441 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42442 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42443 }
42444
42445 // If we're inserting an element from a vbroadcast load, fold the
42446 // load into the X86insertps instruction. We need to convert the scalar
42447 // load to a vector and clear the source lane of the INSERTPS control.
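// Every lane of a broadcast holds the same value, so the source-lane field
// (bits 7:6) of the control can be cleared and the broadcast replaced by a
// plain scalar f32 load that is folded via SCALAR_TO_VECTOR.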
42448 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42449 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42450 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42451 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42452 MemIntr->getBasePtr(),
42453 MemIntr->getMemOperand());
42454 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42455 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42456 Load),
42457 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42458 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42459 return Insert;
42460 }
42461 }
42462
42463 return SDValue();
42464 }
42465 case X86ISD::VPERMV3: {
42466 // Combine VPERMV3 to widened VPERMV if the two source operands can be
42467 // freely concatenated.
42468 if (VT.is128BitVector() ||
42469 (VT.is256BitVector() && Subtarget.useAVX512Regs())) {
42470 SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
42471 MVT WideVT = VT.getDoubleNumVectorElementsVT();
42472 if (SDValue ConcatSrc =
42473 combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) {
42474 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
42475 DL, WideVT.getSizeInBits());
42476 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
42477 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
42478 DAG.getVectorIdxConstant(0, DL));
42479 }
42480 }
42483 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42484 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42485 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
42486 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
42487 MVT MaskVT = N.getOperand(1).getSimpleValueType();
42488 // Canonicalize to VPERMV if both sources are the same.
42489 if (V1 == V2) {
42490 for (int &M : Mask)
42491 M = (M < 0 ? M : M & (Mask.size() - 1));
42492 SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
42493 /*IsMask=*/true);
42494 return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, N.getOperand(0));
42495 }
42496 // If sources are half width, then concat and use VPERMV with adjusted
42497 // mask.
42498 SDValue Ops[2];
42499 MVT HalfVT = VT.getHalfNumVectorElementsVT();
42500 if (sd_match(V1,
42501 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
42502 sd_match(V2,
42503 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
42504 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
42505 if (SDValue ConcatSrc =
42506 combineConcatVectorOps(DL, VT, Ops, DAG, DCI, Subtarget)) {
42507 for (int &M : Mask)
42508 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
42509 SDValue NewMask = getConstVector(Mask, MaskVT, DAG, DL,
42510 /*IsMask=*/true);
42511 return DAG.getNode(X86ISD::VPERMV, DL, VT, NewMask, ConcatSrc);
42512 }
42513 }
42514 // Commute foldable source to the RHS.
42515 if (isShuffleFoldableLoad(N.getOperand(0)) &&
42516 !isShuffleFoldableLoad(N.getOperand(2))) {
42518 SDValue NewMask =
42519 getConstVector(Mask, MaskVT, DAG, DL, /*IsMask=*/true);
42520 return DAG.getNode(X86ISD::VPERMV3, DL, VT, N.getOperand(2), NewMask,
42521 N.getOperand(0));
42522 }
42523 }
42524 return SDValue();
42525 }
42526 default:
42527 return SDValue();
42528 }
42529
42530 // Nuke no-op shuffles that show up after combining.
42531 if (isNoopShuffleMask(Mask))
42532 return N.getOperand(0);
42533
42534 // Look for simplifications involving one or two shuffle instructions.
42535 SDValue V = N.getOperand(0);
42536 switch (N.getOpcode()) {
42537 default:
42538 break;
42539 case X86ISD::PSHUFLW:
42540 case X86ISD::PSHUFHW:
42541 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42542
42543 // See if this reduces to a PSHUFD which is no more expensive and can
42544 // combine with more operations. Note that it has to at least flip the
42545 // dwords as otherwise it would have been removed as a no-op.
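// e.g. a PSHUFLW with word mask <2,3,0,1> swaps the two low dwords, which
// PSHUFD<1,0,2,3> also does while combining more readily with surrounding
// shuffles.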
42546 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
42547 int DMask[] = {0, 1, 2, 3};
42548 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42549 DMask[DOffset + 0] = DOffset + 1;
42550 DMask[DOffset + 1] = DOffset + 0;
42551 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
42552 V = DAG.getBitcast(DVT, V);
42553 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42554 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42555 return DAG.getBitcast(VT, V);
42556 }
42557
42558 // Look for shuffle patterns which can be implemented as a single unpack.
42559 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42560 // only works when we have a PSHUFD followed by two half-shuffles.
42561 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42562 (V.getOpcode() == X86ISD::PSHUFLW ||
42563 V.getOpcode() == X86ISD::PSHUFHW) &&
42564 V.getOpcode() != N.getOpcode() &&
42565 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42566 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42567 if (D.getOpcode() == X86ISD::PSHUFD) {
42568 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42569 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42570 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42571 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42572 int WordMask[8];
42573 for (int i = 0; i < 4; ++i) {
42574 WordMask[i + NOffset] = Mask[i] + NOffset;
42575 WordMask[i + VOffset] = VMask[i] + VOffset;
42576 }
42577 // Map the word mask through the DWord mask.
42578 int MappedMask[8];
42579 for (int i = 0; i < 8; ++i)
42580 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42581 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42582 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42583 // We can replace all three shuffles with an unpack.
42584 V = DAG.getBitcast(VT, D.getOperand(0));
42585 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42586 : X86ISD::UNPCKH,
42587 DL, VT, V, V);
42588 }
42589 }
42590 }
42591
42592 break;
42593
42594 case X86ISD::PSHUFD:
42595 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
42596 return NewN;
42597
42598 break;
42599 }
42600
42601 return SDValue();
42602}
42603
42604/// Checks if the shuffle mask takes subsequent elements
42605/// alternately from two vectors.
42606/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
42607static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42608
42609 int ParitySrc[2] = {-1, -1};
42610 unsigned Size = Mask.size();
42611 for (unsigned i = 0; i != Size; ++i) {
42612 int M = Mask[i];
42613 if (M < 0)
42614 continue;
42615
42616 // Make sure we are using the matching element from the input.
42617 if ((M % Size) != i)
42618 return false;
42619
42620 // Make sure we use the same input for all elements of the same parity.
42621 int Src = M / Size;
42622 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42623 return false;
42624 ParitySrc[i % 2] = Src;
42625 }
42626
42627 // Make sure each input is used.
42628 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42629 return false;
42630
42631 Op0Even = ParitySrc[0] == 0;
42632 return true;
42633}
42634
42635 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
42636 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
42637 /// are written to the parameters \p Opnd0 and \p Opnd1.
42638///
42639 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
42640 /// so it is easier to generically match. We also insert dummy vector shuffle
42641 /// nodes for the operands which explicitly discard the lanes which are unused
42642 /// by this operation, to try to propagate the fact that they're unused
42643 /// through the rest of the combiner.
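/// For example, shuffle(fsub(a,b), fadd(a,b), <0,5,2,7>) takes the
/// subtraction results in the even lanes and the addition results in the odd
/// lanes, which is exactly the ADDSUB form.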
42644static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42645 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42646 bool &IsSubAdd) {
42647
42648 EVT VT = N->getValueType(0);
42649 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42650 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42652 return false;
42653
42654 // We only handle target-independent shuffles.
42655 // FIXME: It would be easy and harmless to use the target shuffle mask
42656 // extraction tool to support more.
42657 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42658 return false;
42659
42660 SDValue V1 = N->getOperand(0);
42661 SDValue V2 = N->getOperand(1);
42662
42663 // Make sure we have an FADD and an FSUB.
42664 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42665 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42666 V1.getOpcode() == V2.getOpcode())
42667 return false;
42668
42669 // If there are other uses of these operations we can't fold them.
42670 if (!V1->hasOneUse() || !V2->hasOneUse())
42671 return false;
42672
42673 // Ensure that both operations have the same operands. Note that we can
42674 // commute the FADD operands.
42675 SDValue LHS, RHS;
42676 if (V1.getOpcode() == ISD::FSUB) {
42677 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42678 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42679 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42680 return false;
42681 } else {
42682 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42683 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42684 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42685 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42686 return false;
42687 }
42688
42689 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42690 bool Op0Even;
42691 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42692 return false;
42693
42694 // It's a subadd if the vector in the even parity is an FADD.
42695 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42696 : V2->getOpcode() == ISD::FADD;
42697
42698 Opnd0 = LHS;
42699 Opnd1 = RHS;
42700 return true;
42701}
42702
42703/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42705 const X86Subtarget &Subtarget,
42706 SelectionDAG &DAG) {
42707 // We only handle target-independent shuffles.
42708 // FIXME: It would be easy and harmless to use the target shuffle mask
42709 // extraction tool to support more.
42710 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42711 return SDValue();
42712
42713 MVT VT = N->getSimpleValueType(0);
42714 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42715 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42716 return SDValue();
42717
42718 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
42719 SDValue Op0 = N->getOperand(0);
42720 SDValue Op1 = N->getOperand(1);
42721 SDValue FMAdd = Op0, FMSub = Op1;
42722 if (FMSub.getOpcode() != X86ISD::FMSUB)
42723 std::swap(FMAdd, FMSub);
42724
42725 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42726 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42727 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42728 FMAdd.getOperand(2) != FMSub.getOperand(2))
42729 return SDValue();
42730
42731 // Check for correct shuffle mask.
42732 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42733 bool Op0Even;
42734 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42735 return SDValue();
42736
42737 // FMAddSub takes zeroth operand from FMSub node.
42738 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42739 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42740 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42741 FMAdd.getOperand(2));
42742}
42743
42744/// Try to combine a shuffle into a target-specific add-sub or
42745/// mul-add-sub node.
42747 const X86Subtarget &Subtarget,
42748 SelectionDAG &DAG) {
42749 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
42750 return V;
42751
42752 SDValue Opnd0, Opnd1;
42753 bool IsSubAdd;
42754 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42755 return SDValue();
42756
42757 MVT VT = N->getSimpleValueType(0);
42758
42759 // Try to generate X86ISD::FMADDSUB node here.
42760 SDValue Opnd2;
42761 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42762 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42763 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42764 }
42765
42766 if (IsSubAdd)
42767 return SDValue();
42768
42769 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42770 // the ADDSUB idiom has been successfully recognized. There are no known
42771 // X86 targets with 512-bit ADDSUB instructions!
42772 if (VT.is512BitVector())
42773 return SDValue();
42774
42775 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42776 // the ADDSUB idiom has been successfully recognized. There are no known
42777 // X86 targets with FP16 ADDSUB instructions!
42778 if (VT.getVectorElementType() == MVT::f16)
42779 return SDValue();
42780
42781 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42782}
42783
42784// We are looking for a shuffle where both sources are concatenated with undef
42785// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42786// if we can express this as a single-source shuffle, that's preferable.
42788 SelectionDAG &DAG,
42789 const X86Subtarget &Subtarget) {
42790 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42791 return SDValue();
42792
42793 EVT VT = N->getValueType(0);
42794
42795 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42796 if (!VT.is128BitVector() && !VT.is256BitVector())
42797 return SDValue();
42798
42799 if (VT.getVectorElementType() != MVT::i32 &&
42800 VT.getVectorElementType() != MVT::i64 &&
42801 VT.getVectorElementType() != MVT::f32 &&
42802 VT.getVectorElementType() != MVT::f64)
42803 return SDValue();
42804
42805 SDValue N0 = N->getOperand(0);
42806 SDValue N1 = N->getOperand(1);
42807
42808 // Check that both sources are concats with undef.
42809 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42810 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42811 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42812 !N1.getOperand(1).isUndef())
42813 return SDValue();
42814
42815 // Construct the new shuffle mask. Elements from the first source retain their
42816 // index, but elements from the second source no longer need to skip an undef.
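// e.g. for v8i32, shuffle index 9 refers to element 1 of t2; in the new
// single concat(t1,t2) source that element sits at index 5, hence indices
// from the second source are reduced by NumElts/2.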
42818 int NumElts = VT.getVectorNumElements();
42819
42820 auto *SVOp = cast<ShuffleVectorSDNode>(N);
42821 for (int Elt : SVOp->getMask())
42822 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42823
42824 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42825 N1.getOperand(0));
42826 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42827}
42828
42829/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42830/// low half of each source vector and does not set any high half elements in
42831/// the destination vector, narrow the shuffle to half its original size.
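/// For example, a v8f64 shuffle with mask <0,9,1,8,u,u,u,u> reads only the
/// low halves of both sources and writes only the low half of the result, so
/// it can be performed as a v4f64 shuffle whose result is then widened with
/// undef upper elements for free.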
42833 EVT VT = Shuf->getValueType(0);
42834 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
42835 return SDValue();
42836 if (!VT.is256BitVector() && !VT.is512BitVector())
42837 return SDValue();
42838
42839 // See if we can ignore all of the high elements of the shuffle.
42840 ArrayRef<int> Mask = Shuf->getMask();
42841 if (!isUndefUpperHalf(Mask))
42842 return SDValue();
42843
42844 // Check if the shuffle mask accesses only the low half of each input vector
42845 // (half-index output is 0 or 2).
42846 int HalfIdx1, HalfIdx2;
42847 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42848 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42849 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42850 return SDValue();
42851
42852 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42853 // The trick is knowing that all of the insert/extract are actually free
42854 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42855 // of narrow inputs into a narrow output, and that is always cheaper than
42856 // the wide shuffle that we started with.
42857 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42858 Shuf->getOperand(1), HalfMask, HalfIdx1,
42859 HalfIdx2, false, DAG, /*UseConcat*/ true);
42860}
42861
42864 const X86Subtarget &Subtarget) {
42865 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42866 if (SDValue V = narrowShuffle(Shuf, DAG))
42867 return V;
42868
42869 // If we have legalized the vector types, look for blends of FADD and FSUB
42870 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42871 SDLoc dl(N);
42872 EVT VT = N->getValueType(0);
42873 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42874 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
42875 if (SDValue AddSub =
42876 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
42877 return AddSub;
42878
42879 // Attempt to combine into a vector load/broadcast.
42881 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42882 return LD;
42883
42884 // For AVX2, we sometimes want to combine
42885 // (vector_shuffle <mask> (concat_vectors t1, undef)
42886 // (concat_vectors t2, undef))
42887 // Into:
42888 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42889 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42890 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
42891 return ShufConcat;
42892
42893 if (isTargetShuffle(N->getOpcode())) {
42894 SDValue Op(N, 0);
42895 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
42896 return Shuffle;
42897
42898 // Try recursively combining arbitrary sequences of x86 shuffle
42899 // instructions into higher-order shuffles. We do this after combining
42900 // specific PSHUF instruction sequences into their minimal form so that we
42901 // can evaluate how many specialized shuffle instructions are involved in
42902 // a particular chain.
42903 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42904 return Res;
42905
42906 // Simplify source operands based on shuffle mask.
42907 // TODO - merge this into combineX86ShufflesRecursively.
42908 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42909 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42910 return SDValue(N, 0);
42911
42912 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42913 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42914 // Perform this after other shuffle combines to allow inner shuffles to be
42915 // combined away first.
42916 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
42917 return BinOp;
42918 }
42919
42920 return SDValue();
42921}
42922
42923// Simplify variable target shuffle masks based on the demanded elements.
42924// TODO: Handle DemandedBits in mask indices as well?
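// e.g. if only the low half of a PSHUFB result is demanded, the constant
// mask bytes for the upper half can be rewritten to undef in a fresh
// constant pool entry, letting later combines treat those lanes as
// don't-care.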
42926 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42927 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42928 // If we're demanding all elements don't bother trying to simplify the mask.
42929 unsigned NumElts = DemandedElts.getBitWidth();
42930 if (DemandedElts.isAllOnes())
42931 return false;
42932
42933 SDValue Mask = Op.getOperand(MaskIndex);
42934 if (!Mask.hasOneUse())
42935 return false;
42936
42937 // Attempt to generically simplify the variable shuffle mask.
42938 APInt MaskUndef, MaskZero;
42939 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42940 Depth + 1))
42941 return true;
42942
42943 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42944 // TODO: Support other types from getTargetShuffleMaskIndices?
42946 EVT BCVT = BC.getValueType();
42947 auto *Load = dyn_cast<LoadSDNode>(BC);
42948 if (!Load || !Load->getBasePtr().hasOneUse())
42949 return false;
42950
42951 const Constant *C = getTargetConstantFromNode(Load);
42952 if (!C)
42953 return false;
42954
42955 Type *CTy = C->getType();
42956 if (!CTy->isVectorTy() ||
42957 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42958 return false;
42959
42960 // Handle scaling for i64 elements on 32-bit targets.
42961 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42962 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42963 return false;
42964 unsigned Scale = NumCstElts / NumElts;
42965
42966 // Simplify mask if we have an undemanded element that is not undef.
42967 bool Simplified = false;
42968 SmallVector<Constant *, 32> ConstVecOps;
42969 for (unsigned i = 0; i != NumCstElts; ++i) {
42970 Constant *Elt = C->getAggregateElement(i);
42971 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42972 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42973 Simplified = true;
42974 continue;
42975 }
42976 ConstVecOps.push_back(Elt);
42977 }
42978 if (!Simplified)
42979 return false;
42980
42981 // Generate new constant pool entry + legalize immediately for the load.
42982 SDLoc DL(Op);
42983 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42984 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42985 SDValue NewMask = TLO.DAG.getLoad(
42986 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42988 Load->getAlign());
42989 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42990}
42991
42993 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42994 TargetLoweringOpt &TLO, unsigned Depth) const {
42995 int NumElts = DemandedElts.getBitWidth();
42996 unsigned Opc = Op.getOpcode();
42997 EVT VT = Op.getValueType();
42998
42999 // Handle special case opcodes.
43000 switch (Opc) {
43001 case X86ISD::PMULDQ:
43002 case X86ISD::PMULUDQ: {
43003 APInt LHSUndef, LHSZero;
43004 APInt RHSUndef, RHSZero;
43005 SDValue LHS = Op.getOperand(0);
43006 SDValue RHS = Op.getOperand(1);
43007 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43008 Depth + 1))
43009 return true;
43010 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43011 Depth + 1))
43012 return true;
43013 // Multiply by zero.
43014 KnownZero = LHSZero | RHSZero;
43015 break;
43016 }
43017 case X86ISD::VPMADDUBSW:
43018 case X86ISD::VPMADDWD: {
43019 APInt LHSUndef, LHSZero;
43020 APInt RHSUndef, RHSZero;
43021 SDValue LHS = Op.getOperand(0);
43022 SDValue RHS = Op.getOperand(1);
43023 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43024
43025 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43026 Depth + 1))
43027 return true;
43028 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43029 Depth + 1))
43030 return true;
43031
43032 // TODO: Multiply by zero.
43033
43034 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43035 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43036 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43037 Depth + 1))
43038 return true;
43039 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43040 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43041 Depth + 1))
43042 return true;
43043 break;
43044 }
43045 case X86ISD::PSADBW: {
43046 SDValue LHS = Op.getOperand(0);
43047 SDValue RHS = Op.getOperand(1);
43048 assert(VT.getScalarType() == MVT::i64 &&
43049 LHS.getValueType() == RHS.getValueType() &&
43050 LHS.getValueType().getScalarType() == MVT::i8 &&
43051 "Unexpected PSADBW types");
43052
43053 // Aggressively peek through ops to get at the demanded elts.
43054 if (!DemandedElts.isAllOnes()) {
43055 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43056 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43058 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43060 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43061 if (NewLHS || NewRHS) {
43062 NewLHS = NewLHS ? NewLHS : LHS;
43063 NewRHS = NewRHS ? NewRHS : RHS;
43064 return TLO.CombineTo(
43065 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43066 }
43067 }
43068 break;
43069 }
43070 case X86ISD::VSHL:
43071 case X86ISD::VSRL:
43072 case X86ISD::VSRA: {
43073 // We only need the bottom 64-bits of the (128-bit) shift amount.
43074 SDValue Amt = Op.getOperand(1);
43075 MVT AmtVT = Amt.getSimpleValueType();
43076 assert(AmtVT.is128BitVector() && "Unexpected value type");
43077
43078 // If we reuse the shift amount just for SSE shift amounts then we know that
43079 // only the bottom 64 bits are ever used.
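// e.g. PSLLQ with an XMM count operand only reads the low quadword of that
// count register, so the upper half of the amount vector never needs to be
// materialized.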
43080 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43081 unsigned UseOpc = Use->getOpcode();
43082 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43083 UseOpc == X86ISD::VSRA) &&
43084 Use->getOperand(0) != Amt;
43085 });
43086
43087 APInt AmtUndef, AmtZero;
43088 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43089 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43090 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43091 Depth + 1, AssumeSingleUse))
43092 return true;
43093 [[fallthrough]];
43094 }
43095 case X86ISD::VSHLI:
43096 case X86ISD::VSRLI:
43097 case X86ISD::VSRAI: {
43098 SDValue Src = Op.getOperand(0);
43099 APInt SrcUndef;
43100 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43101 Depth + 1))
43102 return true;
43103
43104 // Fold shift(0,x) -> 0
43105 if (DemandedElts.isSubsetOf(KnownZero))
43106 return TLO.CombineTo(
43107 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43108
43109 // Aggressively peek through ops to get at the demanded elts.
43110 if (!DemandedElts.isAllOnes())
43112 Src, DemandedElts, TLO.DAG, Depth + 1))
43113 return TLO.CombineTo(
43114 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43115 break;
43116 }
43117 case X86ISD::VPSHA:
43118 case X86ISD::VPSHL:
43119 case X86ISD::VSHLV:
43120 case X86ISD::VSRLV:
43121 case X86ISD::VSRAV: {
43122 APInt LHSUndef, LHSZero;
43123 APInt RHSUndef, RHSZero;
43124 SDValue LHS = Op.getOperand(0);
43125 SDValue RHS = Op.getOperand(1);
43126 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43127 Depth + 1))
43128 return true;
43129
43130 // Fold shift(0,x) -> 0
43131 if (DemandedElts.isSubsetOf(LHSZero))
43132 return TLO.CombineTo(
43133 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43134
43135 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43136 Depth + 1))
43137 return true;
43138
43139 KnownZero = LHSZero;
43140 break;
43141 }
43142 case X86ISD::PCMPEQ:
43143 case X86ISD::PCMPGT: {
43144 APInt LHSUndef, LHSZero;
43145 APInt RHSUndef, RHSZero;
43146 SDValue LHS = Op.getOperand(0);
43147 SDValue RHS = Op.getOperand(1);
43148 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43149 Depth + 1))
43150 return true;
43151 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43152 Depth + 1))
43153 return true;
43154 break;
43155 }
43156 case X86ISD::KSHIFTL: {
43157 SDValue Src = Op.getOperand(0);
43158 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43159 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43160 unsigned ShiftAmt = Amt->getZExtValue();
43161
43162 if (ShiftAmt == 0)
43163 return TLO.CombineTo(Op, Src);
43164
43165 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43166 // single shift. We can do this if the bottom bits (which are shifted
43167 // out) are never demanded.
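// e.g. kshiftl(kshiftr(k,2),3) can become kshiftl(k,1) when the low three
// elements of the result are not demanded.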
43168 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43169 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43170 unsigned C1 = Src.getConstantOperandVal(1);
43171 unsigned NewOpc = X86ISD::KSHIFTL;
43172 int Diff = ShiftAmt - C1;
43173 if (Diff < 0) {
43174 Diff = -Diff;
43175 NewOpc = X86ISD::KSHIFTR;
43176 }
43177
43178 SDLoc dl(Op);
43179 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43180 return TLO.CombineTo(
43181 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43182 }
43183 }
43184
43185 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43186 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43187 Depth + 1))
43188 return true;
43189
43190 KnownUndef <<= ShiftAmt;
43191 KnownZero <<= ShiftAmt;
43192 KnownZero.setLowBits(ShiftAmt);
43193 break;
43194 }
43195 case X86ISD::KSHIFTR: {
43196 SDValue Src = Op.getOperand(0);
43197 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43198 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43199 unsigned ShiftAmt = Amt->getZExtValue();
43200
43201 if (ShiftAmt == 0)
43202 return TLO.CombineTo(Op, Src);
43203
43204 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43205 // single shift. We can do this if the top bits (which are shifted
43206 // out) are never demanded.
43207 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43208 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43209 unsigned C1 = Src.getConstantOperandVal(1);
43210 unsigned NewOpc = X86ISD::KSHIFTR;
43211 int Diff = ShiftAmt - C1;
43212 if (Diff < 0) {
43213 Diff = -Diff;
43214 NewOpc = X86ISD::KSHIFTL;
43215 }
43216
43217 SDLoc dl(Op);
43218 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43219 return TLO.CombineTo(
43220 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43221 }
43222 }
43223
43224 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43225 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43226 Depth + 1))
43227 return true;
43228
43229 KnownUndef.lshrInPlace(ShiftAmt);
43230 KnownZero.lshrInPlace(ShiftAmt);
43231 KnownZero.setHighBits(ShiftAmt);
43232 break;
43233 }
43234 case X86ISD::ANDNP: {
43235 // ANDNP = (~LHS & RHS);
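// When one operand is a constant, only the bits/elements that can actually
// influence the result need to be demanded from the other operand: e.g. a
// zero constant element on the RHS makes the matching LHS element
// irrelevant, since ~lhs & 0 is 0 regardless of lhs.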
43236 SDValue LHS = Op.getOperand(0);
43237 SDValue RHS = Op.getOperand(1);
43238
43239 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43240 APInt UndefElts;
43241 SmallVector<APInt> EltBits;
43242 int NumElts = VT.getVectorNumElements();
43243 int EltSizeInBits = VT.getScalarSizeInBits();
43244 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43245 APInt OpElts = DemandedElts;
43246 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43247 EltBits)) {
43248 OpBits.clearAllBits();
43249 OpElts.clearAllBits();
43250 for (int I = 0; I != NumElts; ++I) {
43251 if (!DemandedElts[I])
43252 continue;
43253 if (UndefElts[I]) {
43254 // We can't assume an undef src element gives an undef dst - the
43255 // other src might be zero.
43256 OpBits.setAllBits();
43257 OpElts.setBit(I);
43258 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43259 (!Invert && !EltBits[I].isZero())) {
43260 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43261 OpElts.setBit(I);
43262 }
43263 }
43264 }
43265 return std::make_pair(OpBits, OpElts);
43266 };
43267 APInt BitsLHS, EltsLHS;
43268 APInt BitsRHS, EltsRHS;
43269 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43270 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43271
43272 APInt LHSUndef, LHSZero;
43273 APInt RHSUndef, RHSZero;
43274 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43275 Depth + 1))
43276 return true;
43277 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43278 Depth + 1))
43279 return true;
43280
43281 if (!DemandedElts.isAllOnes()) {
43282 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43283 TLO.DAG, Depth + 1);
43284 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43285 TLO.DAG, Depth + 1);
43286 if (NewLHS || NewRHS) {
43287 NewLHS = NewLHS ? NewLHS : LHS;
43288 NewRHS = NewRHS ? NewRHS : RHS;
43289 return TLO.CombineTo(
43290 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43291 }
43292 }
43293 break;
43294 }
43295 case X86ISD::CVTSI2P:
43296 case X86ISD::CVTUI2P:
43297 case X86ISD::CVTPH2PS:
43298 case X86ISD::CVTPS2PH: {
43299 SDValue Src = Op.getOperand(0);
43300 EVT SrcVT = Src.getValueType();
43301 APInt SrcUndef, SrcZero;
43302 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43303 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43304 Depth + 1))
43305 return true;
43306 break;
43307 }
43308 case X86ISD::PACKSS:
43309 case X86ISD::PACKUS: {
43310 SDValue N0 = Op.getOperand(0);
43311 SDValue N1 = Op.getOperand(1);
43312
43313 APInt DemandedLHS, DemandedRHS;
43314 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43315
43316 APInt LHSUndef, LHSZero;
43317 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43318 Depth + 1))
43319 return true;
43320 APInt RHSUndef, RHSZero;
43321 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43322 Depth + 1))
43323 return true;
43324
43325 // TODO - pass on known zero/undef.
43326
43327 // Aggressively peek through ops to get at the demanded elts.
43328 // TODO - we should do this for all target/faux shuffles ops.
43329 if (!DemandedElts.isAllOnes()) {
43330 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43331 TLO.DAG, Depth + 1);
43332 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43333 TLO.DAG, Depth + 1);
43334 if (NewN0 || NewN1) {
43335 NewN0 = NewN0 ? NewN0 : N0;
43336 NewN1 = NewN1 ? NewN1 : N1;
43337 return TLO.CombineTo(Op,
43338 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43339 }
43340 }
43341 break;
43342 }
43343 case X86ISD::HADD:
43344 case X86ISD::HSUB:
43345 case X86ISD::FHADD:
43346 case X86ISD::FHSUB: {
43347 SDValue N0 = Op.getOperand(0);
43348 SDValue N1 = Op.getOperand(1);
43349
43350 APInt DemandedLHS, DemandedRHS;
43351 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43352
43353 APInt LHSUndef, LHSZero;
43354 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43355 Depth + 1))
43356 return true;
43357 APInt RHSUndef, RHSZero;
43358 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43359 Depth + 1))
43360 return true;
43361
43362 // TODO - pass on known zero/undef.
43363
43364 // Aggressively peek through ops to get at the demanded elts.
43365 // TODO: Handle repeated operands.
43366 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43367 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43368 TLO.DAG, Depth + 1);
43369 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43370 TLO.DAG, Depth + 1);
43371 if (NewN0 || NewN1) {
43372 NewN0 = NewN0 ? NewN0 : N0;
43373 NewN1 = NewN1 ? NewN1 : N1;
43374 return TLO.CombineTo(Op,
43375 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43376 }
43377 }
43378 break;
43379 }
43380 case X86ISD::VTRUNC:
43381 case X86ISD::VTRUNCS:
43382 case X86ISD::VTRUNCUS: {
43383 SDValue Src = Op.getOperand(0);
43384 MVT SrcVT = Src.getSimpleValueType();
43385 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43386 APInt SrcUndef, SrcZero;
43387 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43388 Depth + 1))
43389 return true;
43390 KnownZero = SrcZero.zextOrTrunc(NumElts);
43391 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43392 break;
43393 }
43394 case X86ISD::BLENDI: {
43395 SmallVector<int, 16> BlendMask;
43396 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43398 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43399 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43400 return TLO.CombineTo(Op, R);
43401 break;
43402 }
43403 case X86ISD::BLENDV: {
43404 APInt SelUndef, SelZero;
43405 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43406 SelZero, TLO, Depth + 1))
43407 return true;
43408
43409 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43410 APInt LHSUndef, LHSZero;
43411 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43412 LHSZero, TLO, Depth + 1))
43413 return true;
43414
43415 APInt RHSUndef, RHSZero;
43416 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43417 RHSZero, TLO, Depth + 1))
43418 return true;
43419
43420 KnownZero = LHSZero & RHSZero;
43421 KnownUndef = LHSUndef & RHSUndef;
43422 break;
43423 }
43424 case X86ISD::VZEXT_MOVL: {
43425 // If upper demanded elements are already zero then we have nothing to do.
43426 SDValue Src = Op.getOperand(0);
43427 APInt DemandedUpperElts = DemandedElts;
43428 DemandedUpperElts.clearLowBits(1);
43429 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43430 return TLO.CombineTo(Op, Src);
43431 break;
43432 }
43433 case X86ISD::VZEXT_LOAD: {
43434 // If the upper elements are not demanded then simplify to a
43435 // scalar_to_vector(load()).
43437 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43438 SDLoc DL(Op);
43439 auto *Mem = cast<MemSDNode>(Op);
43440 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43441 Mem->getMemOperand());
43442 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43443 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43444 }
43445 break;
43446 }
43447 case X86ISD::VBROADCAST: {
43448 SDValue Src = Op.getOperand(0);
43449 MVT SrcVT = Src.getSimpleValueType();
43450 // Don't bother broadcasting if we just need the 0'th element.
43451 if (DemandedElts == 1) {
43452 if (!SrcVT.isVector())
43453 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
43454 else if (Src.getValueType() != VT)
43455 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43456 SDLoc(Op));
43457 return TLO.CombineTo(Op, Src);
43458 }
43459 if (!SrcVT.isVector())
43460 break;
43461 APInt SrcUndef, SrcZero;
43462 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43463 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43464 Depth + 1))
43465 return true;
43466 // Aggressively peek through src to get at the demanded elt.
43467 // TODO - we should do this for all target/faux shuffles ops.
43469 Src, SrcElts, TLO.DAG, Depth + 1))
43470 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43471 break;
43472 }
43473 case X86ISD::VPERMV:
43474 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43475 Depth))
43476 return true;
43477 break;
43478 case X86ISD::PSHUFB:
43479 case X86ISD::VPERMV3:
43480 case X86ISD::VPERMILPV:
43481 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43482 Depth))
43483 return true;
43484 break;
43485 case X86ISD::VPPERM:
43486 case X86ISD::VPERMIL2:
43487 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43488 Depth))
43489 return true;
43490 break;
43491 }
43492
43493 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43494 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43495 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
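// e.g. a 512-bit v16i32 compare where only elements 0-3 are demanded is
// narrowed to a v4i32 compare (ExtSizeInBits == 128) whose result is widened
// back with undef upper elements.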
43496 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43497 DemandedElts.lshr(NumElts / 2) == 0) {
43498 unsigned SizeInBits = VT.getSizeInBits();
43499 unsigned ExtSizeInBits = SizeInBits / 2;
43500
43501 // See if 512-bit ops only use the bottom 128-bits.
43502 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43503 ExtSizeInBits = SizeInBits / 4;
43504
43505 switch (Opc) {
43506 // Scalar broadcast.
43507 case X86ISD::VBROADCAST: {
43508 SDLoc DL(Op);
43509 SDValue Src = Op.getOperand(0);
43510 if (Src.getValueSizeInBits() > ExtSizeInBits)
43511 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43512 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43513 ExtSizeInBits / VT.getScalarSizeInBits());
43514 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43515 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43516 TLO.DAG, DL, ExtSizeInBits));
43517 }
43519 SDLoc DL(Op);
43520 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43521 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43522 ExtSizeInBits / VT.getScalarSizeInBits());
43523 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43524 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43525 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43526 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43527 MemIntr->getMemOperand());
43529 Bcst.getValue(1));
43530 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43531 TLO.DAG, DL, ExtSizeInBits));
43532 }
43533 // Subvector broadcast.
43535 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43536 EVT MemVT = MemIntr->getMemoryVT();
43537 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43538 SDLoc DL(Op);
43539 SDValue Ld =
43540 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43541 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43543 Ld.getValue(1));
43544 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43545 TLO.DAG, DL, ExtSizeInBits));
43546 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43547 SDLoc DL(Op);
43548 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43549 ExtSizeInBits / VT.getScalarSizeInBits());
43550 if (SDValue BcstLd =
43551 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43552 return TLO.CombineTo(Op,
43553 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43554 TLO.DAG, DL, ExtSizeInBits));
43555 }
43556 break;
43557 }
43558 // Byte shifts by immediate.
43559 case X86ISD::VSHLDQ:
43560 case X86ISD::VSRLDQ:
43561 // Shift by uniform.
43562 case X86ISD::VSHL:
43563 case X86ISD::VSRL:
43564 case X86ISD::VSRA:
43565 // Shift by immediate.
43566 case X86ISD::VSHLI:
43567 case X86ISD::VSRLI:
43568 case X86ISD::VSRAI: {
43569 SDLoc DL(Op);
43570 SDValue Ext0 =
43571 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43572 SDValue ExtOp =
43573 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43574 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43575 SDValue Insert =
43576 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43577 return TLO.CombineTo(Op, Insert);
43578 }
43579 case X86ISD::VPERMI: {
43580 // Simplify PERMPD/PERMQ to extract_subvector.
43581 // TODO: This should be done in shuffle combining.
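// e.g. VPERMQ with immediate 0x0E places the source's upper 128-bit half
// into the low half of the result, so if only the low elements are demanded
// this is just an extract_subvector of elements 2-3.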
43582 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43584 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43585 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43586 SDLoc DL(Op);
43587 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43588 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43589 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43590 return TLO.CombineTo(Op, Insert);
43591 }
43592 }
43593 break;
43594 }
43595 case X86ISD::VPERM2X128: {
43596 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43597 SDLoc DL(Op);
43598 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
43599 if (LoMask & 0x8)
43600 return TLO.CombineTo(
43601 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43602 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43603 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43604 SDValue ExtOp =
43605 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43606 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43607 SDValue Insert =
43608 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43609 return TLO.CombineTo(Op, Insert);
43610 }
43611 // Conversions.
43612 // TODO: Add more CVT opcodes when we have test coverage.
43613 case X86ISD::CVTTP2SI:
43614 case X86ISD::CVTTP2UI:
43615 case X86ISD::CVTPH2PS: {
43616 SDLoc DL(Op);
43617 unsigned Scale = SizeInBits / ExtSizeInBits;
43618 SDValue SrcOp = Op.getOperand(0);
43619 MVT SrcVT = SrcOp.getSimpleValueType();
43620 unsigned SrcExtSize =
43621 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
43623 ExtSizeInBits / VT.getScalarSizeInBits());
43624 SDValue ExtOp = TLO.DAG.getNode(
43625 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
43626 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43627 SDValue Insert =
43628 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43629 return TLO.CombineTo(Op, Insert);
43630 }
43631 // Zero upper elements.
43632 case X86ISD::VZEXT_MOVL:
43633 // Variable blend.
43634 case X86ISD::BLENDV:
43635 // Target unary shuffles by immediate:
43636 case X86ISD::PSHUFD:
43637 case X86ISD::PSHUFLW:
43638 case X86ISD::PSHUFHW:
43639 case X86ISD::VPERMILPI:
43640 // (Non-Lane Crossing) Target Shuffles.
43641 case X86ISD::VPERMILPV:
43642 case X86ISD::VPERMIL2:
43643 case X86ISD::PSHUFB:
43644 case X86ISD::UNPCKL:
43645 case X86ISD::UNPCKH:
43646 case X86ISD::BLENDI:
43647 // Integer ops.
43648 case X86ISD::PACKSS:
43649 case X86ISD::PACKUS:
43650 case X86ISD::PCMPEQ:
43651 case X86ISD::PCMPGT:
43652 case X86ISD::PMULUDQ:
43653 case X86ISD::PMULDQ:
43654 case X86ISD::VSHLV:
43655 case X86ISD::VSRLV:
43656 case X86ISD::VSRAV:
43657 // Float ops.
43658 case X86ISD::FMAX:
43659 case X86ISD::FMIN:
43660 case X86ISD::FMAXC:
43661 case X86ISD::FMINC:
43662 case X86ISD::FRSQRT:
43663 case X86ISD::FRCP:
43664 // Horizontal Ops.
43665 case X86ISD::HADD:
43666 case X86ISD::HSUB:
43667 case X86ISD::FHADD:
43668 case X86ISD::FHSUB: {
43669 SDLoc DL(Op);
43671 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43672 SDValue SrcOp = Op.getOperand(i);
43673 EVT SrcVT = SrcOp.getValueType();
43674 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43675 "Unsupported vector size");
43676 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43677 ExtSizeInBits)
43678 : SrcOp);
43679 }
43680 MVT ExtVT = VT.getSimpleVT();
43681 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43682 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43683 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43684 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43685 SDValue Insert =
43686 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43687 return TLO.CombineTo(Op, Insert);
43688 }
43689 }
43690 }
43691
43692 // For splats, unless we *only* demand the 0'th element,
43693 // stop attempts at simplification here - we aren't going to improve things,
43694 // and this is better than any potential shuffle.
43695 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43696 return false;
43697
43698 // Get target/faux shuffle mask.
43699 APInt OpUndef, OpZero;
43700 SmallVector<int, 64> OpMask;
43701 SmallVector<SDValue, 2> OpInputs;
43702 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43703 OpZero, TLO.DAG, Depth, false))
43704 return false;
43705
43706 // Shuffle inputs must be the same size as the result.
43707 if (OpMask.size() != (unsigned)NumElts ||
43708 llvm::any_of(OpInputs, [VT](SDValue V) {
43709 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43710 !V.getValueType().isVector();
43711 }))
43712 return false;
43713
43714 KnownZero = OpZero;
43715 KnownUndef = OpUndef;
43716
43717 // Check if shuffle mask can be simplified to undef/zero/identity.
43718 int NumSrcs = OpInputs.size();
43719 for (int i = 0; i != NumElts; ++i)
43720 if (!DemandedElts[i])
43721 OpMask[i] = SM_SentinelUndef;
43722
43723 if (isUndefInRange(OpMask, 0, NumElts)) {
43724 KnownUndef.setAllBits();
43725 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43726 }
43727 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43728 KnownZero.setAllBits();
43729 return TLO.CombineTo(
43730 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43731 }
43732 for (int Src = 0; Src != NumSrcs; ++Src)
43733 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43734 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43735
43736 // Attempt to simplify inputs.
43737 for (int Src = 0; Src != NumSrcs; ++Src) {
43738 // TODO: Support inputs of different types.
43739 if (OpInputs[Src].getValueType() != VT)
43740 continue;
43741
43742 int Lo = Src * NumElts;
43743 APInt SrcElts = APInt::getZero(NumElts);
43744 for (int i = 0; i != NumElts; ++i)
43745 if (DemandedElts[i]) {
43746 int M = OpMask[i] - Lo;
43747 if (0 <= M && M < NumElts)
43748 SrcElts.setBit(M);
43749 }
43750
43751 // TODO - Propagate input undef/zero elts.
43752 APInt SrcUndef, SrcZero;
43753 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43754 TLO, Depth + 1))
43755 return true;
43756 }
43757
43758 // If we don't demand all elements, then attempt to combine to a simpler
43759 // shuffle.
43760 // We need to convert the depth to something combineX86ShufflesRecursively
43761 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
43762 // to match. This prevents combineX86ShuffleChain from returning a
43763 // combined shuffle that's the same as the original root, causing an
43764 // infinite loop.
43765 if (!DemandedElts.isAllOnes()) {
43766 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43767
43768 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43769 for (int i = 0; i != NumElts; ++i)
43770 if (DemandedElts[i])
43771 DemandedMask[i] = i;
43772
43773 SDValue NewShuffle = combineX86ShufflesRecursively(
43774 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43775 /*HasVarMask*/ false,
43776 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43777 Subtarget);
43778 if (NewShuffle)
43779 return TLO.CombineTo(Op, NewShuffle);
43780 }
43781
43782 return false;
43783}
43784
43785 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43786 SDValue Op, const APInt &OriginalDemandedBits,
43787 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43788 unsigned Depth) const {
43789 EVT VT = Op.getValueType();
43790 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43791 unsigned Opc = Op.getOpcode();
43792 switch(Opc) {
43793 case X86ISD::VTRUNC: {
43794 KnownBits KnownOp;
43795 SDValue Src = Op.getOperand(0);
43796 MVT SrcVT = Src.getSimpleValueType();
43797
43798 // Simplify the input, using demanded bit information.
43799 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43800 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43801 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43802 return true;
43803 break;
43804 }
43805 case X86ISD::PMULDQ:
43806 case X86ISD::PMULUDQ: {
43807 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
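// For illustration, the per-lane semantics are roughly:
//   Res[i] = ext64(lo32(LHS[i])) * ext64(lo32(RHS[i]))
// (sign-extended for PMULDQ, zero-extended for PMULUDQ), so bits 32..63 of
// each source lane can never influence the result and need not be demanded.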
43808 KnownBits KnownLHS, KnownRHS;
43809 SDValue LHS = Op.getOperand(0);
43810 SDValue RHS = Op.getOperand(1);
43811
43812 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43813 // FIXME: Can we bound this better?
43814 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43815 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43816 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43817
43818 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43819 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43820 DemandedMaskLHS = DemandedMask;
43821 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43822 DemandedMaskRHS = DemandedMask;
43823
43824 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43825 KnownLHS, TLO, Depth + 1))
43826 return true;
43827 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43828 KnownRHS, TLO, Depth + 1))
43829 return true;
43830
43831 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
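// e.g. for v2i64 X = <0x100000005, 7>: PMULUDQ(X, <1,1>) multiplies only the
// low 32-bit halves, giving <5, 7>, the same as AND(X, <0xFFFFFFFF,0xFFFFFFFF>).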
43832 KnownRHS = KnownRHS.trunc(32);
43833 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43834 KnownRHS.getConstant().isOne()) {
43835 SDLoc DL(Op);
43836 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43837 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43838 }
43839
43840 // Aggressively peek through ops to get at the demanded low bits.
43841 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43842 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43843 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43844 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43845 if (DemandedLHS || DemandedRHS) {
43846 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43847 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43848 return TLO.CombineTo(
43849 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43850 }
43851 break;
43852 }
43853 case X86ISD::ANDNP: {
43854 KnownBits Known2;
43855 SDValue Op0 = Op.getOperand(0);
43856 SDValue Op1 = Op.getOperand(1);
43857
43858 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
43859 Known, TLO, Depth + 1))
43860 return true;
43861
43862 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
43863 OriginalDemandedElts, Known2, TLO, Depth + 1))
43864 return true;
43865
43866 // If the RHS is a constant, see if we can simplify it.
43867 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
43868 OriginalDemandedElts, TLO))
43869 return true;
43870
43871 // ANDNP = (~Op0 & Op1);
43872 Known.One &= Known2.Zero;
43873 Known.Zero |= Known2.One;
43874 break;
43875 }
43876 case X86ISD::VSHLI: {
43877 SDValue Op0 = Op.getOperand(0);
43878 SDValue Op1 = Op.getOperand(1);
43879
43880 unsigned ShAmt = Op1->getAsZExtVal();
43881 if (ShAmt >= BitWidth)
43882 break;
43883
43884 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43885
43886 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43887 // single shift. We can do this if the bottom bits (which are shifted
43888 // out) are never demanded.
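// Worked example with 32-bit lanes, assuming the low ShAmt bits are not
// demanded:
//   ((X >>u 8) << 8) --> X             (Diff == 0)
//   ((X >>u 8) << 4) --> (X >>u 4)     (Diff < 0, becomes VSRLI)
//   ((X >>u 4) << 8) --> (X << 4)      (Diff > 0, stays VSHLI)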
43889 if (Op0.getOpcode() == X86ISD::VSRLI &&
43890 OriginalDemandedBits.countr_zero() >= ShAmt) {
43891 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43892 if (Shift2Amt < BitWidth) {
43893 int Diff = ShAmt - Shift2Amt;
43894 if (Diff == 0)
43895 return TLO.CombineTo(Op, Op0.getOperand(0));
43896
43897 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43898 SDValue NewShift = TLO.DAG.getNode(
43899 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43900 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43901 return TLO.CombineTo(Op, NewShift);
43902 }
43903 }
43904
43905 // If we are only demanding sign bits then we can use the shift source directly.
43906 unsigned NumSignBits =
43907 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43908 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43909 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43910 return TLO.CombineTo(Op, Op0);
43911
43912 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43913 TLO, Depth + 1))
43914 return true;
43915
43916 Known.Zero <<= ShAmt;
43917 Known.One <<= ShAmt;
43918
43919 // Low bits known zero.
43920 Known.Zero.setLowBits(ShAmt);
43921
43922 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
43923 // Attempt to avoid multi-use ops if we don't need anything from them.
43924 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43925 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
43926 SDValue NewOp =
43927 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
43928 return TLO.CombineTo(Op, NewOp);
43929 }
43930 }
43931 return false;
43932 }
43933 case X86ISD::VSRLI: {
43934 SDValue Op0 = Op.getOperand(0);
43935 SDValue Op1 = Op.getOperand(1);
43936
43937 unsigned ShAmt = Op1->getAsZExtVal();
43938 if (ShAmt >= BitWidth)
43939 break;
43940
43941 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43942
43943 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43944 TLO, Depth + 1))
43945 return true;
43946
43947 Known.Zero.lshrInPlace(ShAmt);
43948 Known.One.lshrInPlace(ShAmt);
43949
43950 // High bits known zero.
43951 Known.Zero.setHighBits(ShAmt);
43952
43953 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
43954 // Attempt to avoid multi-use ops if we don't need anything from them.
43955 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43956 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
43957 SDValue NewOp =
43958 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
43959 return TLO.CombineTo(Op, NewOp);
43960 }
43961 }
43962 return false;
43963 }
43964 case X86ISD::VSRAI: {
43965 SDValue Op0 = Op.getOperand(0);
43966 SDValue Op1 = Op.getOperand(1);
43967
43968 unsigned ShAmt = Op1->getAsZExtVal();
43969 if (ShAmt >= BitWidth)
43970 break;
43971
43972 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43973
43974 // If we just want the sign bit then we don't need to shift it.
43975 if (OriginalDemandedBits.isSignMask())
43976 return TLO.CombineTo(Op, Op0);
43977
43978 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
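// e.g. with 16-bit lanes and C1 == 4: if X has at least 5 sign bits, VSHLI
// only discards copies of the sign bit and VSRAI recreates them, so the
// round trip yields X again.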
43979 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
43980 SDValue Op00 = Op0.getOperand(0);
43981 unsigned NumSignBits =
43982 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43983 if (ShAmt < NumSignBits)
43984 return TLO.CombineTo(Op, Op00);
43985 }
43986
43987 // If any of the demanded bits are produced by the sign extension, we also
43988 // demand the input sign bit.
43989 if (OriginalDemandedBits.countl_zero() < ShAmt)
43990 DemandedMask.setSignBit();
43991
43992 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43993 TLO, Depth + 1))
43994 return true;
43995
43996 Known.Zero.lshrInPlace(ShAmt);
43997 Known.One.lshrInPlace(ShAmt);
43998
43999 // If the input sign bit is known to be zero, or if none of the top bits
44000 // are demanded, turn this into an unsigned shift right.
44001 if (Known.Zero[BitWidth - ShAmt - 1] ||
44002 OriginalDemandedBits.countl_zero() >= ShAmt)
44003 return TLO.CombineTo(
44004 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44005
44006 // High bits are known one.
44007 if (Known.One[BitWidth - ShAmt - 1])
44008 Known.One.setHighBits(ShAmt);
44009
44010 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44011 // Attempt to avoid multi-use ops if we don't need anything from them.
44012 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44013 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44014 SDValue NewOp =
44015 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44016 return TLO.CombineTo(Op, NewOp);
44017 }
44018 }
44019 return false;
44020 }
44021 case X86ISD::BLENDV: {
44022 SDValue Sel = Op.getOperand(0);
44023 SDValue LHS = Op.getOperand(1);
44024 SDValue RHS = Op.getOperand(2);
44025
44026 APInt SignMask = APInt::getSignMask(BitWidth);
44027 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44028 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44029 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44030 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44031 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44032 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44033
44034 if (NewSel || NewLHS || NewRHS) {
44035 NewSel = NewSel ? NewSel : Sel;
44036 NewLHS = NewLHS ? NewLHS : LHS;
44037 NewRHS = NewRHS ? NewRHS : RHS;
44038 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44039 NewSel, NewLHS, NewRHS));
44040 }
44041 break;
44042 }
44043 case X86ISD::PEXTRB:
44044 case X86ISD::PEXTRW: {
44045 SDValue Vec = Op.getOperand(0);
44046 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44047 MVT VecVT = Vec.getSimpleValueType();
44048 unsigned NumVecElts = VecVT.getVectorNumElements();
44049
44050 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44051 unsigned Idx = CIdx->getZExtValue();
44052 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44053
44054 // If we demand no bits from the vector then we must have demanded
44055 // bits from the implicit zext - simplify to zero.
44056 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44057 if (DemandedVecBits == 0)
44058 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44059
44060 APInt KnownUndef, KnownZero;
44061 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44062 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44063 KnownZero, TLO, Depth + 1))
44064 return true;
44065
44066 KnownBits KnownVec;
44067 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44068 KnownVec, TLO, Depth + 1))
44069 return true;
44070
44071 if (SDValue V = SimplifyMultipleUseDemandedBits(
44072 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44073 return TLO.CombineTo(
44074 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44075
44076 Known = KnownVec.zext(BitWidth);
44077 return false;
44078 }
44079 break;
44080 }
44081 case X86ISD::PINSRB:
44082 case X86ISD::PINSRW: {
44083 SDValue Vec = Op.getOperand(0);
44084 SDValue Scl = Op.getOperand(1);
44085 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44086 MVT VecVT = Vec.getSimpleValueType();
44087
44088 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44089 unsigned Idx = CIdx->getZExtValue();
44090 if (!OriginalDemandedElts[Idx])
44091 return TLO.CombineTo(Op, Vec);
44092
44093 KnownBits KnownVec;
44094 APInt DemandedVecElts(OriginalDemandedElts);
44095 DemandedVecElts.clearBit(Idx);
44096 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44097 KnownVec, TLO, Depth + 1))
44098 return true;
44099
44100 KnownBits KnownScl;
44101 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44102 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44103 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44104 return true;
44105
44106 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44107 Known = KnownVec.intersectWith(KnownScl);
44108 return false;
44109 }
44110 break;
44111 }
44112 case X86ISD::PACKSS:
44113 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44114 // sign bit then we can just ask for the source operands' sign bits.
44115 // TODO - add known bits handling.
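// For illustration: PACKSSWB clamps each i16 source lane to [-128, 127], and
// signed saturation never flips the sign, so each result lane's sign bit
// equals the corresponding source lane's sign bit.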
44116 if (OriginalDemandedBits.isSignMask()) {
44117 APInt DemandedLHS, DemandedRHS;
44118 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44119
44120 KnownBits KnownLHS, KnownRHS;
44121 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44122 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44123 KnownLHS, TLO, Depth + 1))
44124 return true;
44125 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44126 KnownRHS, TLO, Depth + 1))
44127 return true;
44128
44129 // Attempt to avoid multi-use ops if we don't need anything from them.
44130 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44131 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44132 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44133 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44134 if (DemandedOp0 || DemandedOp1) {
44135 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44136 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44137 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44138 }
44139 }
44140 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44141 break;
44142 case X86ISD::VBROADCAST: {
44143 SDValue Src = Op.getOperand(0);
44144 MVT SrcVT = Src.getSimpleValueType();
44145 APInt DemandedElts = APInt::getOneBitSet(
44146 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44147 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44148 TLO, Depth + 1))
44149 return true;
44150 // If we don't need the upper bits, attempt to narrow the broadcast source.
44151 // Don't attempt this on AVX512 as it might affect broadcast folding.
44152 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
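// e.g. (v2i64 VBROADCAST i64:x) with only the low 32 bits of each lane
// demanded can become bitcast(v4i32 VBROADCAST (i32 trunc x)).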
44153 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44154 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44155 Src->hasOneUse()) {
44156 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44157 SDValue NewSrc =
44158 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44159 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44160 SDValue NewBcst =
44161 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44162 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44163 }
44164 break;
44165 }
44166 case X86ISD::PCMPGT:
44167 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44168 // iff we only need the sign bit then we can use R directly.
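// e.g. MOVMSK(PCMPGT(0, R)) reads only each lane's MSB; PCMPGT(0, R) is
// all-ones exactly when R is negative, so its MSB already equals R's MSB.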
44169 if (OriginalDemandedBits.isSignMask() &&
44170 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44171 return TLO.CombineTo(Op, Op.getOperand(1));
44172 break;
44173 case X86ISD::MOVMSK: {
44174 SDValue Src = Op.getOperand(0);
44175 MVT SrcVT = Src.getSimpleValueType();
44176 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44177 unsigned NumElts = SrcVT.getVectorNumElements();
44178
44179 // If we don't need the sign bits at all just return zero.
44180 if (OriginalDemandedBits.countr_zero() >= NumElts)
44181 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44182
44183 // See if we only demand bits from the lower 128-bit vector.
44184 if (SrcVT.is256BitVector() &&
44185 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44186 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44187 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44188 }
44189
44190 // Only demand the vector elements of the sign bits we need.
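// e.g. for MOVMSK(v4f32 Src) where only bit 0 of the result is demanded,
// DemandedElts becomes 0b0001 and only element 0 of Src needs simplifying.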
44191 APInt KnownUndef, KnownZero;
44192 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44193 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44194 TLO, Depth + 1))
44195 return true;
44196
44197 Known.Zero = KnownZero.zext(BitWidth);
44198 Known.Zero.setHighBits(BitWidth - NumElts);
44199
44200 // MOVMSK only uses the MSB from each vector element.
44201 KnownBits KnownSrc;
44202 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44203 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44204 Depth + 1))
44205 return true;
44206
44207 if (KnownSrc.One[SrcBits - 1])
44208 Known.One.setLowBits(NumElts);
44209 else if (KnownSrc.Zero[SrcBits - 1])
44210 Known.Zero.setLowBits(NumElts);
44211
44212 // Attempt to avoid multi-use ops if we don't need anything from it.
44213 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44214 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44215 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44216 return false;
44217 }
44218 case X86ISD::TESTP: {
44219 SDValue Op0 = Op.getOperand(0);
44220 SDValue Op1 = Op.getOperand(1);
44221 MVT OpVT = Op0.getSimpleValueType();
44222 assert((OpVT.getVectorElementType() == MVT::f32 ||
44223 OpVT.getVectorElementType() == MVT::f64) &&
44224 "Illegal vector type for X86ISD::TESTP");
44225
44226 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44227 KnownBits KnownSrc;
44228 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44229 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44230 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44231 AssumeSingleUse) ||
44232 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44233 AssumeSingleUse);
44234 }
44235 case X86ISD::CMOV: {
44236 KnownBits Known2;
44237 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44238 OriginalDemandedElts, Known2, TLO, Depth + 1))
44239 return true;
44240 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44241 OriginalDemandedElts, Known, TLO, Depth + 1))
44242 return true;
44243
44244 // Only known if known in both the LHS and RHS.
44245 Known = Known.intersectWith(Known2);
44246 break;
44247 }
44248 case X86ISD::BEXTR:
44249 case X86ISD::BEXTRI: {
44250 SDValue Op0 = Op.getOperand(0);
44251 SDValue Op1 = Op.getOperand(1);
44252
44253 // Only bottom 16-bits of the control bits are required.
44254 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44255 // NOTE: SimplifyDemandedBits won't do this for constants.
44256 uint64_t Val1 = Cst1->getZExtValue();
44257 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44258 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44259 SDLoc DL(Op);
44260 return TLO.CombineTo(
44261 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44262 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44263 }
44264
44265 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44266 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44267
44268 // If the length is 0, the result is 0.
44269 if (Length == 0) {
44270 Known.setAllZero();
44271 return false;
44272 }
44273
44274 if ((Shift + Length) <= BitWidth) {
44275 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44276 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44277 return true;
44278
44279 Known = Known.extractBits(Length, Shift);
44280 Known = Known.zextOrTrunc(BitWidth);
44281 return false;
44282 }
44283 } else {
44284 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44285 KnownBits Known1;
44286 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44287 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44288 return true;
44289
44290 // If the length is 0, replace with 0.
44291 KnownBits LengthBits = Known1.extractBits(8, 8);
44292 if (LengthBits.isZero())
44293 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44294 }
44295
44296 break;
44297 }
44298 case X86ISD::PDEP: {
44299 SDValue Op0 = Op.getOperand(0);
44300 SDValue Op1 = Op.getOperand(1);
44301
44302 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44303 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44304
44305 // If the demanded bits have leading zeroes, we don't demand those from the
44306 // mask.
44307 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44308 return true;
44309
44310 // The number of possible 1s in the mask determines the number of LSBs of
44311 // operand 0 used. Undemanded bits from the mask don't matter so filter
44312 // them before counting.
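// e.g. PDEP(Src, Mask=0b10110): at most 3 mask bits are set, so only the low
// 3 bits of Src are consumed (deposited at bit positions 1, 2 and 4).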
44313 KnownBits Known2;
44314 uint64_t Count = (~Known.Zero & LoMask).popcount();
44315 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44316 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44317 return true;
44318
44319 // Zeroes are retained from the mask, but not ones.
44320 Known.One.clearAllBits();
44321 // The result will have at least as many trailing zeros as the non-mask
44322 // operand since bits can only map to the same or higher bit position.
44323 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44324 return false;
44325 }
44326 }
44327
44328 return TargetLowering::SimplifyDemandedBitsForTargetNode(
44329 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44330}
44331
44332 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44333 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
44334 SelectionDAG &DAG, unsigned Depth) const {
44335 int NumElts = DemandedElts.getBitWidth();
44336 unsigned Opc = Op.getOpcode();
44337 EVT VT = Op.getValueType();
44338
44339 switch (Opc) {
44340 case X86ISD::PINSRB:
44341 case X86ISD::PINSRW: {
44342 // If we don't demand the inserted element, return the base vector.
44343 SDValue Vec = Op.getOperand(0);
44344 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44345 MVT VecVT = Vec.getSimpleValueType();
44346 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44347 !DemandedElts[CIdx->getZExtValue()])
44348 return Vec;
44349 break;
44350 }
44351 case X86ISD::VSHLI: {
44352 // If we are only demanding sign bits then we can use the shift source
44353 // directly.
44354 SDValue Op0 = Op.getOperand(0);
44355 unsigned ShAmt = Op.getConstantOperandVal(1);
44356 unsigned BitWidth = DemandedBits.getBitWidth();
44357 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
44358 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
44359 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44360 return Op0;
44361 break;
44362 }
44363 case X86ISD::VSRAI:
44364 // iff we only need the sign bit then we can use the source directly.
44365 // TODO: generalize where we only demand extended signbits.
44366 if (DemandedBits.isSignMask())
44367 return Op.getOperand(0);
44368 break;
44369 case X86ISD::PCMPGT:
44370 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44371 // iff we only need the sign bit then we can use R directly.
44372 if (DemandedBits.isSignMask() &&
44373 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44374 return Op.getOperand(1);
44375 break;
44376 case X86ISD::BLENDV: {
44377 // BLENDV: Cond (MSB) ? LHS : RHS
44378 SDValue Cond = Op.getOperand(0);
44379 SDValue LHS = Op.getOperand(1);
44380 SDValue RHS = Op.getOperand(2);
44381
44382 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
44383 if (CondKnown.isNegative())
44384 return LHS;
44385 if (CondKnown.isNonNegative())
44386 return RHS;
44387 break;
44388 }
44389 case X86ISD::ANDNP: {
44390 // ANDNP = (~LHS & RHS);
44391 SDValue LHS = Op.getOperand(0);
44392 SDValue RHS = Op.getOperand(1);
44393
44394 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
44395 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
44396
44397 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
44398 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
44399 // this context, so return RHS.
44400 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
44401 return RHS;
44402 break;
44403 }
44404 }
44405
44406 APInt ShuffleUndef, ShuffleZero;
44407 SmallVector<int, 16> ShuffleMask;
44408 SmallVector<SDValue, 2> ShuffleOps;
44409 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
44410 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
44411 // If all the demanded elts are from one operand and are inline,
44412 // then we can use the operand directly.
44413 int NumOps = ShuffleOps.size();
44414 if (ShuffleMask.size() == (unsigned)NumElts &&
44415 llvm::all_of(ShuffleOps, [VT](SDValue V) {
44416 return VT.getSizeInBits() == V.getValueSizeInBits();
44417 })) {
44418
44419 if (DemandedElts.isSubsetOf(ShuffleUndef))
44420 return DAG.getUNDEF(VT);
44421 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
44422 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
44423
44424 // Bitmask that indicates which ops have only been accessed 'inline'.
44425 APInt IdentityOp = APInt::getAllOnes(NumOps);
44426 for (int i = 0; i != NumElts; ++i) {
44427 int M = ShuffleMask[i];
44428 if (!DemandedElts[i] || ShuffleUndef[i])
44429 continue;
44430 int OpIdx = M / NumElts;
44431 int EltIdx = M % NumElts;
44432 if (M < 0 || EltIdx != i) {
44433 IdentityOp.clearAllBits();
44434 break;
44435 }
44436 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
44437 if (IdentityOp == 0)
44438 break;
44439 }
44440 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
44441 "Multiple identity shuffles detected");
44442
44443 if (IdentityOp != 0)
44444 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
44445 }
44446 }
44447
44448 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44449 Op, DemandedBits, DemandedElts, DAG, Depth);
44450}
44451
44452 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44453 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44454 bool PoisonOnly, unsigned Depth) const {
44455 unsigned NumElts = DemandedElts.getBitWidth();
44456
44457 switch (Op.getOpcode()) {
44458 case X86ISD::PSHUFD:
44459 case X86ISD::VPERMILPI:
44460 case X86ISD::VPERMV3: {
44461 SmallVector<SDValue, 2> Ops;
44462 SmallVector<int, 8> Mask;
44463 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
44464 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
44465 APInt::getZero(NumElts));
44466 for (auto M : enumerate(Mask)) {
44467 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
44468 continue;
44469 if (M.value() == SM_SentinelUndef)
44470 return false;
44471 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
44472 "Shuffle mask index out of range");
44473 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
44474 }
44475 for (auto Op : enumerate(Ops))
44476 if (!DemandedSrcElts[Op.index()].isZero() &&
44477 !DAG.isGuaranteedNotToBeUndefOrPoison(
44478 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
44479 return false;
44480 return true;
44481 }
44482 break;
44483 }
44484 }
44485 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44486 Op, DemandedElts, DAG, PoisonOnly, Depth);
44487}
44488
44489 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
44490 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44491 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44492
44493 switch (Op.getOpcode()) {
44494 // SSE vector multiplies are either inbounds or saturate.
44495 case X86ISD::VPMADDUBSW:
44496 case X86ISD::VPMADDWD:
44497 // SSE vector shifts handle out of bounds shift amounts.
44498 case X86ISD::VSHLI:
44499 case X86ISD::VSRLI:
44500 case X86ISD::VSRAI:
44501 return false;
44502 case X86ISD::PSHUFD:
44503 case X86ISD::VPERMILPI:
44504 case X86ISD::VPERMV3:
44505 case X86ISD::UNPCKH:
44506 case X86ISD::UNPCKL:
44507 return false;
44508 // SSE comparisons handle all fcmp cases.
44509 // TODO: Add PCMPEQ/GT and CMPM/MM with test coverage.
44510 case X86ISD::CMPP:
44511 return false;
44512 case ISD::INTRINSIC_WO_CHAIN:
44513 switch (Op->getConstantOperandVal(0)) {
44514 case Intrinsic::x86_sse2_pmadd_wd:
44515 case Intrinsic::x86_avx2_pmadd_wd:
44516 case Intrinsic::x86_avx512_pmaddw_d_512:
44517 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
44518 case Intrinsic::x86_avx2_pmadd_ub_sw:
44519 case Intrinsic::x86_avx512_pmaddubs_w_512:
44520 return false;
44521 }
44522 }
44523 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
44524 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44525}
44526
44527 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
44528 const APInt &DemandedElts,
44529 APInt &UndefElts,
44530 const SelectionDAG &DAG,
44531 unsigned Depth) const {
44532 unsigned NumElts = DemandedElts.getBitWidth();
44533 unsigned Opc = Op.getOpcode();
44534
44535 switch (Opc) {
44536 case X86ISD::VBROADCAST:
44537 case X86ISD::VBROADCAST_LOAD:
44538 UndefElts = APInt::getZero(NumElts);
44539 return true;
44540 }
44541
44542 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44543 DAG, Depth);
44544}
44545
44546// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44547// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
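// e.g. (v8i1 and (setcc (v8i32 a), (v8i32 b), cc), (trunc (v8i32 c))) traces
// both arms back to 256-bit sources, so a query with Size == 256 (and
// AllowTruncate) succeeds.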
44548static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44549 bool AllowTruncate) {
44550 switch (Src.getOpcode()) {
44551 case ISD::TRUNCATE:
44552 if (!AllowTruncate)
44553 return false;
44554 [[fallthrough]];
44555 case ISD::SETCC:
44556 return Src.getOperand(0).getValueSizeInBits() == Size;
44557 case ISD::FREEZE:
44558 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate);
44559 case ISD::AND:
44560 case ISD::XOR:
44561 case ISD::OR:
44562 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44563 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44564 case ISD::SELECT:
44565 case ISD::VSELECT:
44566 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44567 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44568 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44569 case ISD::BUILD_VECTOR:
44570 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44571 ISD::isBuildVectorAllOnes(Src.getNode());
44572 }
44573 return false;
44574}
44575
44576// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44577static unsigned getAltBitOpcode(unsigned Opcode) {
44578 switch(Opcode) {
44579 // clang-format off
44580 case ISD::AND: return X86ISD::FAND;
44581 case ISD::OR: return X86ISD::FOR;
44582 case ISD::XOR: return X86ISD::FXOR;
44583 case X86ISD::ANDNP: return X86ISD::FANDN;
44584 // clang-format on
44585 }
44586 llvm_unreachable("Unknown bitwise opcode");
44587}
44588
44589// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44590 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44591 const SDLoc &DL) {
44592 EVT SrcVT = Src.getValueType();
44593 if (SrcVT != MVT::v4i1)
44594 return SDValue();
44595
44596 switch (Src.getOpcode()) {
44597 case ISD::SETCC:
44598 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44599 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44600 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44601 SDValue Op0 = Src.getOperand(0);
44602 if (ISD::isNormalLoad(Op0.getNode()))
44603 return DAG.getBitcast(MVT::v4f32, Op0);
44604 if (Op0.getOpcode() == ISD::BITCAST &&
44605 Op0.getOperand(0).getValueType() == MVT::v4f32)
44606 return Op0.getOperand(0);
44607 }
44608 break;
44609 case ISD::AND:
44610 case ISD::XOR:
44611 case ISD::OR: {
44612 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44613 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44614 if (Op0 && Op1)
44615 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44616 Op1);
44617 break;
44618 }
44619 }
44620 return SDValue();
44621}
44622
44623// Helper to push sign extension of vXi1 SETCC result through bitops.
44624 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44625 SDValue Src, const SDLoc &DL) {
44626 switch (Src.getOpcode()) {
44627 case ISD::SETCC:
44628 case ISD::FREEZE:
44629 case ISD::TRUNCATE:
44630 case ISD::BUILD_VECTOR:
44631 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44632 case ISD::AND:
44633 case ISD::XOR:
44634 case ISD::OR:
44635 return DAG.getNode(
44636 Src.getOpcode(), DL, SExtVT,
44637 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44638 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44639 case ISD::SELECT:
44640 case ISD::VSELECT:
44641 return DAG.getSelect(
44642 DL, SExtVT, Src.getOperand(0),
44643 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44644 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44645 }
44646 llvm_unreachable("Unexpected node type for vXi1 sign extension");
44647}
44648
44649// Try to match patterns such as
44650// (i16 bitcast (v16i1 x))
44651// ->
44652// (i16 movmsk (16i8 sext (v16i1 x)))
44653// before the illegal vector is scalarized on subtargets that don't have legal
44654// vxi1 types.
44655 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44656 const SDLoc &DL,
44657 const X86Subtarget &Subtarget) {
44658 EVT SrcVT = Src.getValueType();
44659 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44660 return SDValue();
44661
44662 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44663 // legalization destroys the v4i32 type.
44664 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44665 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44666 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44667 DAG.getBitcast(MVT::v4f32, V));
44668 return DAG.getZExtOrTrunc(V, DL, VT);
44669 }
44670 }
44671
44672 // If the input is a truncate from v16i8, v32i8 or v64i8, go ahead and use a
44673 // movmskb even with avx512. This will be better than truncating to vXi1 and
44674 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44675 // vpcmpeqb/vpcmpgtb.
44676 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44677 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44678 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44679 Src.getOperand(0).getValueType() == MVT::v64i8);
44680
44681 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44682 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44683 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44684 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44685 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44686 EVT CmpVT = Src.getOperand(0).getValueType();
44687 EVT EltVT = CmpVT.getVectorElementType();
44688 if (CmpVT.getSizeInBits() <= 256 &&
44689 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44690 PreferMovMsk = true;
44691 }
44692
44693 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44694 // MOVMSK is supported in SSE2 or later.
44695 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44696 return SDValue();
44697
44698 // If the upper ops of a concatenation are undef, then try to bitcast the
44699 // lower op and extend.
44700 SmallVector<SDValue, 4> SubSrcOps;
44701 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44702 SubSrcOps.size() >= 2) {
44703 SDValue LowerOp = SubSrcOps[0];
44704 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44705 if (LowerOp.getOpcode() == ISD::SETCC &&
44706 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44707 EVT SubVT = VT.getIntegerVT(
44708 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44709 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44710 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44711 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44712 }
44713 }
44714 }
44715
44716 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44717 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44718 // v8i16 and v16i16.
44719 // For these two cases, we can shuffle the upper element bytes to a
44720 // consecutive sequence at the start of the vector and treat the results as
44721 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44722 // for v16i16 this is not the case, because the shuffle is expensive, so we
44723 // avoid sign-extending to this type entirely.
44724 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44725 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44726 MVT SExtVT;
44727 bool PropagateSExt = false;
44728 switch (SrcVT.getSimpleVT().SimpleTy) {
44729 default:
44730 return SDValue();
44731 case MVT::v2i1:
44732 SExtVT = MVT::v2i64;
44733 break;
44734 case MVT::v4i1:
44735 SExtVT = MVT::v4i32;
44736 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44737 // sign-extend to a 256-bit operation to avoid truncation.
44738 if (Subtarget.hasAVX() &&
44739 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44740 SExtVT = MVT::v4i64;
44741 PropagateSExt = true;
44742 }
44743 break;
44744 case MVT::v8i1:
44745 SExtVT = MVT::v8i16;
44746 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44747 // sign-extend to a 256-bit operation to match the compare.
44748 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44749 // 256-bit because the shuffle is cheaper than sign extending the result of
44750 // the compare.
44751 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44752 checkBitcastSrcVectorSize(Src, 512, true))) {
44753 SExtVT = MVT::v8i32;
44754 PropagateSExt = true;
44755 }
44756 break;
44757 case MVT::v16i1:
44758 SExtVT = MVT::v16i8;
44759 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44760 // it is not profitable to sign-extend to 256-bit because this will
44761 // require an extra cross-lane shuffle which is more expensive than
44762 // truncating the result of the compare to 128-bits.
44763 break;
44764 case MVT::v32i1:
44765 SExtVT = MVT::v32i8;
44766 break;
44767 case MVT::v64i1:
44768 // If we have AVX512F but not AVX512BW, the truncate-from-v64i8 case was
44769 // checked earlier, so split the input and make two pmovmskbs.
44770 if (Subtarget.hasAVX512()) {
44771 if (Subtarget.hasBWI())
44772 return SDValue();
44773 SExtVT = MVT::v64i8;
44774 break;
44775 }
44776 // Split if this is a <64 x i8> comparison result.
44777 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44778 SExtVT = MVT::v64i8;
44779 break;
44780 }
44781 return SDValue();
44782 };
44783
44784 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44785 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44786
44787 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44788 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44789 } else {
44790 if (SExtVT == MVT::v8i16) {
44791 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
44792 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
44793 }
44794 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44795 }
44796
44797 EVT IntVT =
44798 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44799 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44800 return DAG.getBitcast(VT, V);
44801}
44802
44803// Convert a vXi1 constant build vector to the same width scalar integer.
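// e.g. (v4i1 build_vector 1,0,1,1) becomes the i4 constant 0b1101 (element 0
// maps to bit 0).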
44804 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44805 EVT SrcVT = Op.getValueType();
44806 assert(SrcVT.getVectorElementType() == MVT::i1 &&
44807 "Expected a vXi1 vector");
44809 "Expected a constant build vector");
44810
44811 APInt Imm(SrcVT.getVectorNumElements(), 0);
44812 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44813 SDValue In = Op.getOperand(Idx);
44814 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
44815 Imm.setBit(Idx);
44816 }
44817 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44818 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44819}
44820
44821 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44822 TargetLowering::DAGCombinerInfo &DCI,
44823 const X86Subtarget &Subtarget) {
44824 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44825
44826 if (!DCI.isBeforeLegalizeOps())
44827 return SDValue();
44828
44829 // Only do this if we have k-registers.
44830 if (!Subtarget.hasAVX512())
44831 return SDValue();
44832
44833 EVT DstVT = N->getValueType(0);
44834 SDValue Op = N->getOperand(0);
44835 EVT SrcVT = Op.getValueType();
44836
44837 if (!Op.hasOneUse())
44838 return SDValue();
44839
44840 // Look for logic ops.
44841 if (Op.getOpcode() != ISD::AND &&
44842 Op.getOpcode() != ISD::OR &&
44843 Op.getOpcode() != ISD::XOR)
44844 return SDValue();
44845
44846 // Make sure we have a bitcast between mask registers and a scalar type.
44847 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44848 DstVT.isScalarInteger()) &&
44849 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44850 SrcVT.isScalarInteger()))
44851 return SDValue();
44852
44853 SDValue LHS = Op.getOperand(0);
44854 SDValue RHS = Op.getOperand(1);
44855
44856 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44857 LHS.getOperand(0).getValueType() == DstVT)
44858 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44859 DAG.getBitcast(DstVT, RHS));
44860
44861 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44862 RHS.getOperand(0).getValueType() == DstVT)
44863 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44864 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44865
44866 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44867 // Most of these have to move a constant from the scalar domain anyway.
44868 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44869 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44870 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44871 DAG.getBitcast(DstVT, LHS), RHS);
44872 }
44873
44874 return SDValue();
44875}
44876
44877 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44878 const X86Subtarget &Subtarget) {
44879 SDLoc DL(BV);
44880 unsigned NumElts = BV->getNumOperands();
44881 SDValue Splat = BV->getSplatValue();
44882
44883 // Build MMX element from integer GPR or SSE float values.
44884 auto CreateMMXElement = [&](SDValue V) {
44885 if (V.isUndef())
44886 return DAG.getUNDEF(MVT::x86mmx);
44887 if (V.getValueType().isFloatingPoint()) {
44888 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44889 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44890 V = DAG.getBitcast(MVT::v2i64, V);
44891 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44892 }
44893 V = DAG.getBitcast(MVT::i32, V);
44894 } else {
44895 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44896 }
44897 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44898 };
44899
44900 // Convert build vector ops to MMX data in the bottom elements.
44901 SmallVector<SDValue, 8> Ops;
44902
44903 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44904
44905 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44906 if (Splat) {
44907 if (Splat.isUndef())
44908 return DAG.getUNDEF(MVT::x86mmx);
44909
44910 Splat = CreateMMXElement(Splat);
44911
44912 if (Subtarget.hasSSE1()) {
44913 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44914 if (NumElts == 8)
44915 Splat = DAG.getNode(
44916 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44917 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44918 TLI.getPointerTy(DAG.getDataLayout())),
44919 Splat, Splat);
44920
44921 // Use PSHUFW to repeat 16-bit elements.
44922 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44923 return DAG.getNode(
44924 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44925 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44926 TLI.getPointerTy(DAG.getDataLayout())),
44927 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44928 }
44929 Ops.append(NumElts, Splat);
44930 } else {
44931 for (unsigned i = 0; i != NumElts; ++i)
44932 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44933 }
44934
44935 // Use tree of PUNPCKLs to build up general MMX vector.
44936 while (Ops.size() > 1) {
44937 unsigned NumOps = Ops.size();
44938 unsigned IntrinOp =
44939 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44940 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44941 : Intrinsic::x86_mmx_punpcklbw));
44942 SDValue Intrin = DAG.getTargetConstant(
44943 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44944 for (unsigned i = 0; i != NumOps; i += 2)
44945 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44946 Ops[i], Ops[i + 1]);
44947 Ops.resize(NumOps / 2);
44948 }
44949
44950 return Ops[0];
44951}
44952
44953// Recursive function that attempts to find if a bool vector node was originally
44954// a vector/float/double that got truncated/extended/bitcast to/from a scalar
44955// integer. If so, replace the scalar ops with bool vector equivalents back down
44956// the chain.
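// e.g. a scalar i16 formed as (or (bitcast (v16i1 X)), (bitcast (v16i1 Y)))
// that is later bitcast back to v16i1 can be rebuilt as (or X, Y) entirely in
// the mask domain, avoiding a GPR round trip.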
44957 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44958 SelectionDAG &DAG,
44959 const X86Subtarget &Subtarget,
44960 unsigned Depth = 0) {
44961 if (Depth >= SelectionDAG::MaxRecursionDepth)
44962 return SDValue(); // Limit search depth.
44963
44964 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44965 unsigned Opc = V.getOpcode();
44966 switch (Opc) {
44967 case ISD::BITCAST: {
44968 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44969 SDValue Src = V.getOperand(0);
44970 EVT SrcVT = Src.getValueType();
44971 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44972 return DAG.getBitcast(VT, Src);
44973 break;
44974 }
44975 case ISD::Constant: {
44976 auto *C = cast<ConstantSDNode>(V);
44977 if (C->isZero())
44978 return DAG.getConstant(0, DL, VT);
44979 if (C->isAllOnes())
44980 return DAG.getAllOnesConstant(DL, VT);
44981 break;
44982 }
44983 case ISD::TRUNCATE: {
44984 // If we find a suitable source, a truncated scalar becomes a subvector.
44985 SDValue Src = V.getOperand(0);
44986 EVT NewSrcVT =
44987 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44988 if (TLI.isTypeLegal(NewSrcVT))
44989 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
44990 Subtarget, Depth + 1))
44991 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44992 DAG.getVectorIdxConstant(0, DL));
44993 break;
44994 }
44995 case ISD::ANY_EXTEND:
44996 case ISD::ZERO_EXTEND: {
44997 // If we find a suitable source, an extended scalar becomes a subvector.
44998 SDValue Src = V.getOperand(0);
44999 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45000 Src.getScalarValueSizeInBits());
45001 if (TLI.isTypeLegal(NewSrcVT))
45002 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45003 Subtarget, Depth + 1))
45004 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45005 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45006 : DAG.getConstant(0, DL, VT),
45007 N0, DAG.getVectorIdxConstant(0, DL));
45008 break;
45009 }
45010 case ISD::OR:
45011 case ISD::XOR: {
45012 // If we find suitable sources, we can just move the op to the vector
45013 // domain.
45014 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45015 Subtarget, Depth + 1))
45016 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45017 Subtarget, Depth + 1))
45018 return DAG.getNode(Opc, DL, VT, N0, N1);
45019 break;
45020 }
45021 case ISD::SHL: {
45022 // If we find a suitable source, a SHL becomes a KSHIFTL.
45023 SDValue Src0 = V.getOperand(0);
45024 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45025 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45026 break;
45027
45028 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45029 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45030 Depth + 1))
45031 return DAG.getNode(
45032 X86ISD::KSHIFTL, DL, VT, N0,
45033 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45034 break;
45035 }
45036 }
45037
45038 // Does the inner bitcast already exist?
45039 if (Depth > 0)
45040 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45041 return SDValue(Alt, 0);
45042
45043 return SDValue();
45044}
45045
45046 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45047 TargetLowering::DAGCombinerInfo &DCI,
45048 const X86Subtarget &Subtarget) {
45049 SDValue N0 = N->getOperand(0);
45050 EVT VT = N->getValueType(0);
45051 EVT SrcVT = N0.getValueType();
45052 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45053
45054 // Try to match patterns such as
45055 // (i16 bitcast (v16i1 x))
45056 // ->
45057 // (i16 movmsk (16i8 sext (v16i1 x)))
45058 // before the setcc result is scalarized on subtargets that don't have legal
45059 // vxi1 types.
45060 if (DCI.isBeforeLegalize()) {
45061 SDLoc dl(N);
45062 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45063 return V;
45064
45065 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45066 // type, widen both sides to avoid a trip through memory.
45067 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45068 Subtarget.hasAVX512()) {
45069 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45070 N0 = DAG.getBitcast(MVT::v8i1, N0);
45071 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45072 DAG.getVectorIdxConstant(0, dl));
45073 }
45074
45075 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45076 // type, widen both sides to avoid a trip through memory.
45077 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45078 Subtarget.hasAVX512()) {
45079 // Use zeros for the widening if we already have some zeroes. This can
45080 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45081 // stream of this.
45082 // FIXME: It might make sense to detect a concat_vectors with a mix of
45083 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45084 // a separate combine. What we can't do is canonicalize the operands of
45085 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45086 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45087 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45088 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45089 SrcVT = LastOp.getValueType();
45090 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45091 SmallVector<SDValue, 4> Ops(N0->ops());
45092 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45093 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45094 N0 = DAG.getBitcast(MVT::i8, N0);
45095 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45096 }
45097 }
45098
45099 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45100 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45101 Ops[0] = N0;
45102 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45103 N0 = DAG.getBitcast(MVT::i8, N0);
45104 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45105 }
45106 } else if (DCI.isAfterLegalizeDAG()) {
45107 // If we're bitcasting from iX to vXi1, see if the integer originally
45108 // began as a vXi1 and whether we can remove the bitcast entirely.
45109 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45110 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45111 if (SDValue V =
45112 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45113 return V;
45114 }
45115 }
45116
45117 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45118 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45119 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45120 // we can help with known bits propagation from the vXi1 domain to the
45121 // scalar domain.
45122 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45123 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45124 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45125 isNullConstant(N0.getOperand(1)))
45126 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45127 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45128
45129 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45130 // and the vbroadcast_load are both integer or both fp. In some cases this
45131 // will remove the bitcast entirely.
45132 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45133 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45134 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45135 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45136 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45137 // Don't swap i8/i16 since we don't have fp types of that size.
45138 if (MemSize >= 32) {
45139 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45140 : MVT::getIntegerVT(MemSize);
45141 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45142 : MVT::getIntegerVT(SrcVTSize);
45143 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45144
45145 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45146 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45147 SDValue ResNode =
45148 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45149 MemVT, BCast->getMemOperand());
45150 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45151 return DAG.getBitcast(VT, ResNode);
45152 }
45153 }
45154
45155 // Since MMX types are special and don't usually play with other vector types,
45156 // it's better to handle them early to be sure we emit efficient code by
45157 // avoiding store-load conversions.
45158 if (VT == MVT::x86mmx) {
45159 // Detect MMX constant vectors.
45160 APInt UndefElts;
45161 SmallVector<APInt, 1> EltBits;
45162 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45163 /*AllowWholeUndefs*/ true,
45164 /*AllowPartialUndefs*/ true)) {
45165 SDLoc DL(N0);
45166 // Handle zero-extension of i32 with MOVD.
45167 if (EltBits[0].countl_zero() >= 32)
45168 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45169 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45170 // Else, bitcast to a double.
45171 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45172 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45173 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45174 }
45175
45176 // Detect bitcasts to x86mmx low word.
45177 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45178 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45179 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45180 bool LowUndef = true, AllUndefOrZero = true;
45181 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45182 SDValue Op = N0.getOperand(i);
45183 LowUndef &= Op.isUndef() || (i >= e/2);
45184 AllUndefOrZero &= isNullConstantOrUndef(Op);
45185 }
45186 if (AllUndefOrZero) {
45187 SDValue N00 = N0.getOperand(0);
45188 SDLoc dl(N00);
45189 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45190 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45191 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45192 }
45193 }
45194
45195 // Detect bitcasts of 64-bit build vectors and convert to a
45196 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45197 // lowest element.
45198 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45199 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45200 SrcVT == MVT::v8i8))
45201 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45202
45203 // Detect bitcasts between element or subvector extraction to x86mmx.
45204 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45205 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45206 isNullConstant(N0.getOperand(1))) {
45207 SDValue N00 = N0.getOperand(0);
45208 if (N00.getValueType().is128BitVector())
45209 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45210 DAG.getBitcast(MVT::v2i64, N00));
45211 }
45212
45213 // Detect bitcasts from FP_TO_SINT to x86mmx.
45214 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
45215 SDLoc DL(N0);
45216 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
45217 DAG.getUNDEF(MVT::v2i32));
45218 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
45219 DAG.getBitcast(MVT::v2i64, Res));
45220 }
45221 }
45222
45223 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
45224 // most of these to scalar anyway.
45225 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
45226 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45227 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
45228 return combinevXi1ConstantToInteger(N0, DAG);
45229 }
45230
45231 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
45232 VT.getVectorElementType() == MVT::i1) {
45233 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
45234 if (C->isAllOnes())
45235 return DAG.getConstant(1, SDLoc(N0), VT);
45236 if (C->isZero())
45237 return DAG.getConstant(0, SDLoc(N0), VT);
45238 }
45239 }
45240
45241 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
45242 // Turn it into a sign bit compare that produces a k-register. This avoids
45243 // a trip through a GPR.
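// e.g. (v16i1 bitcast (i16 trunc (i32 MOVMSK (v16i8 X)))) can instead be
// selected as (v16i1 setcc X, 0, setlt) with AVX512BW, so the mask stays in
// a k-register.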
45244 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
45245 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
45246 isPowerOf2_32(VT.getVectorNumElements())) {
45247 unsigned NumElts = VT.getVectorNumElements();
45248 SDValue Src = N0;
45249
45250 // Peek through truncate.
45251 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
45252 Src = N0.getOperand(0);
45253
45254 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
45255 SDValue MovmskIn = Src.getOperand(0);
45256 MVT MovmskVT = MovmskIn.getSimpleValueType();
45257 unsigned MovMskElts = MovmskVT.getVectorNumElements();
45258
45259 // We allow extra bits of the movmsk to be used since they are known zero.
45260 // We can't convert a VPMOVMSKB without avx512bw.
45261 if (MovMskElts <= NumElts &&
45262 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
45263 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
45264 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
45265 SDLoc dl(N);
45266 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
45267 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
45268 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
45269 if (EVT(CmpVT) == VT)
45270 return Cmp;
45271
45272 // Pad with zeroes up to original VT to replace the zeroes that were
45273 // being used from the MOVMSK.
45274 unsigned NumConcats = NumElts / MovMskElts;
45275 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
45276 Ops[0] = Cmp;
45277 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
45278 }
45279 }
45280 }
45281
45282 // Try to remove bitcasts from input and output of mask arithmetic to
45283 // remove GPR<->K-register crossings.
45284 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
45285 return V;
45286
45287 // Convert a bitcasted integer logic operation that has one bitcasted
45288 // floating-point operand into a floating-point logic operation. This may
45289 // create a load of a constant, but that is cheaper than materializing the
45290 // constant in an integer register and transferring it to an SSE register or
45291 // transferring the SSE operand to integer register and back.
45292 unsigned FPOpcode;
45293 switch (N0.getOpcode()) {
45294 // clang-format off
45295 case ISD::AND: FPOpcode = X86ISD::FAND; break;
45296 case ISD::OR: FPOpcode = X86ISD::FOR; break;
45297 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
45298 default: return SDValue();
45299 // clang-format on
45300 }
45301
45302 // Check if we have a bitcast from another integer type as well.
45303 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
45304 (Subtarget.hasSSE2() && VT == MVT::f64) ||
45305 (Subtarget.hasFP16() && VT == MVT::f16) ||
45306 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
45307 TLI.isTypeLegal(VT))))
45308 return SDValue();
45309
45310 SDValue LogicOp0 = N0.getOperand(0);
45311 SDValue LogicOp1 = N0.getOperand(1);
45312 SDLoc DL0(N0);
45313
45314 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
45315 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
45316 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
45317 LogicOp0.getOperand(0).getValueType() == VT &&
45318 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
45319 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
45320 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45321 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
45322 }
45323 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
45324 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
45325 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
45326 LogicOp1.getOperand(0).getValueType() == VT &&
45327 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
45328 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
45329 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45330 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
45331 }
45332
45333 return SDValue();
45334}
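// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source): the
// combine above turns an integer AND/OR/XOR whose operand was bit-cast from a
// float into the matching FAND/FOR/FXOR node. It is safe because bitwise
// logic only depends on the raw bits, so it commutes with bitcasts. A classic
// consequence is that fabs(x) can stay in the SSE domain as an FAND with
// 0x7fffffff. The helper names below are hypothetical.
#include <cstdint>
#include <cstring>

static inline uint32_t ExampleBitsOfFloat(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // bit-preserving "bitcast"
  return Bits;
}

static inline float ExampleFloatOfBits(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

// fabs as a bitwise AND in the FP domain: clear the sign bit, keep the rest.
static inline float ExampleFAbsViaFAnd(float X) {
  return ExampleFloatOfBits(ExampleBitsOfFloat(X) & 0x7fffffffu);
}
// --------------------------------------------------------------------------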
45335
45336 // (mul (zext a), (sext b))
45337static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
45338 SDValue &Op1) {
45339 Op0 = Mul.getOperand(0);
45340 Op1 = Mul.getOperand(1);
45341
45342 // Canonicalize so that Op1 is the sign-extended operand.
45343 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
45344 std::swap(Op0, Op1);
45345
45346 auto IsFreeTruncation = [](SDValue &Op) -> bool {
45347 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
45348 Op.getOpcode() == ISD::SIGN_EXTEND) &&
45349 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
45350 return true;
45351
45352 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
45353 return (BV && BV->isConstant());
45354 };
45355
45356 // (dpbusd (zext a), (sext b)). The first operand must be an unsigned
45357 // value, so we check that Op0 is zero-extended. Op1 must be a signed
45358 // value, so we just check its significant (sign) bits.
45359 if ((IsFreeTruncation(Op0) &&
45360 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
45361 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
45362 return true;
45363
45364 return false;
45365}
45366
45367 // Given an ABS node, detect the following pattern:
45368// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
45369// This is useful as it is the input into a SAD pattern.
45370static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
45371 SDValue AbsOp1 = Abs->getOperand(0);
45372 if (AbsOp1.getOpcode() != ISD::SUB)
45373 return false;
45374
45375 Op0 = AbsOp1.getOperand(0);
45376 Op1 = AbsOp1.getOperand(1);
45377
45378 // Check if the operands of the sub are zero-extended from vectors of i8.
45379 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
45380 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
45381 Op1.getOpcode() != ISD::ZERO_EXTEND ||
45382 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
45383 return false;
45384
45385 return true;
45386}
45387
45388 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
45389 unsigned &LogBias, const SDLoc &DL,
45390 const X86Subtarget &Subtarget) {
45391 // Extend or truncate to MVT::i8 first.
45392 MVT Vi8VT =
45393 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
45394 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
45395 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
45396
45397 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
45398 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
45399 // The src A, B element type is i8, but the dst C element type is i32.
45400 // When we calculate the number of reduction stages we use the vXi8 src
45401 // vector type, so we need a LogBias of 2 to avoid 2 extra stages.
45402 LogBias = 2;
45403
45404 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
45405 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
45406 RegSize = std::max(512u, RegSize);
45407
45408 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45409 // fill in the missing vector elements with 0.
45410 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
45411 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
45412 Ops[0] = LHS;
45413 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45414 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45415 Ops[0] = RHS;
45416 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45417
45418 // Actually build the DotProduct, split as 256/512 bits for
45419 // AVXVNNI/AVX512VNNI.
45420 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45421 ArrayRef<SDValue> Ops) {
45422 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45423 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
45424 };
45425 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
45426 SDValue Zero = DAG.getConstant(0, DL, DpVT);
45427
45428 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
45429 DpBuilder, false);
45430}
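// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source):
// scalar reference for one i32 lane of the non-saturating VPDPBUSD built
// above. A holds unsigned bytes, B holds signed bytes, and the four products
// are accumulated into the i32 lane with wrap-around semantics. The function
// name is hypothetical.
#include <cstdint>

static inline int32_t ExampleDpbusdLane(int32_t Src, const uint8_t A[4],
                                        const int8_t B[4]) {
  uint32_t Acc = static_cast<uint32_t>(Src);
  for (int J = 0; J != 4; ++J)
    Acc += static_cast<uint32_t>(static_cast<int32_t>(A[J]) *
                                 static_cast<int32_t>(B[J])); // zext * sext
  return static_cast<int32_t>(Acc); // wrapping 32-bit accumulation
}
// Because one VPDPBUSD already folds each group of 4 bytes into an i32 lane,
// the shuffle+add reduction built on top of it needs log2(4) = 2 fewer
// stages - which is exactly the LogBias of 2 reported above.
// --------------------------------------------------------------------------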
45431
45432// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
45433// to these zexts.
45434static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
45435 const SDValue &Zext1, const SDLoc &DL,
45436 const X86Subtarget &Subtarget) {
45437 // Find the appropriate width for the PSADBW.
45438 EVT InVT = Zext0.getOperand(0).getValueType();
45439 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
45440
45441 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45442 // fill in the missing vector elements with 0.
45443 unsigned NumConcat = RegSize / InVT.getSizeInBits();
45444 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
45445 Ops[0] = Zext0.getOperand(0);
45446 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45447 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45448 Ops[0] = Zext1.getOperand(0);
45449 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45450
45451 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45452 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45453 ArrayRef<SDValue> Ops) {
45454 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45455 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
45456 };
45457 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
45458 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
45459 PSADBWBuilder);
45460}
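// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source):
// scalar reference for one 64-bit lane of PSADBW. Each lane is the sum of
// absolute differences of eight unsigned bytes, so the result always fits in
// the low 16 bits of the lane. The function name is hypothetical.
#include <cstdint>

static inline uint64_t ExamplePsadbwLane(const uint8_t A[8],
                                         const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += (A[I] > B[I]) ? (A[I] - B[I]) : (B[I] - A[I]);
  return Sum; // at most 8 * 255 == 2040
}
// --------------------------------------------------------------------------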
45461
45462 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
45463// PHMINPOSUW.
45464 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
45465 const X86Subtarget &Subtarget) {
45466 // Bail without SSE41.
45467 if (!Subtarget.hasSSE41())
45468 return SDValue();
45469
45470 EVT ExtractVT = Extract->getValueType(0);
45471 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
45472 return SDValue();
45473
45474 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
45475 ISD::NodeType BinOp;
45476 SDValue Src = DAG.matchBinOpReduction(
45477 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
45478 if (!Src)
45479 return SDValue();
45480
45481 EVT SrcVT = Src.getValueType();
45482 EVT SrcSVT = SrcVT.getScalarType();
45483 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
45484 return SDValue();
45485
45486 SDLoc DL(Extract);
45487 SDValue MinPos = Src;
45488
45489 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
45490 while (SrcVT.getSizeInBits() > 128) {
45491 SDValue Lo, Hi;
45492 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
45493 SrcVT = Lo.getValueType();
45494 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
45495 }
45496 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
45497 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
45498 "Unexpected value type");
45499
45500 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
45501 // to flip the value accordingly.
45502 SDValue Mask;
45503 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
45504 if (BinOp == ISD::SMAX)
45505 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
45506 else if (BinOp == ISD::SMIN)
45507 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
45508 else if (BinOp == ISD::UMAX)
45509 Mask = DAG.getAllOnesConstant(DL, SrcVT);
45510
45511 if (Mask)
45512 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45513
45514 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
45515 // shuffling each upper element down and inserting zeros. This means that the
45516 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
45517 // ready for the PHMINPOS.
45518 if (ExtractVT == MVT::i8) {
45519 SDValue Upper = DAG.getVectorShuffle(
45520 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
45521 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
45522 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
45523 }
45524
45525 // Perform the PHMINPOS on a v8i16 vector.
45526 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
45527 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
45528 MinPos = DAG.getBitcast(SrcVT, MinPos);
45529
45530 if (Mask)
45531 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45532
45533 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
45534 DAG.getVectorIdxConstant(0, DL));
45535}
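// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source): x86
// only provides PHMINPOSUW, an unsigned i16 *minimum* reduction, so the code
// above XORs with a mask before and after to express the other reductions:
//   SMIN: mask 0x8000 (x ^ 0x8000 maps signed order onto unsigned order)
//   SMAX: mask 0x7fff (an order-reversing map from signed to unsigned order)
//   UMAX: mask 0xffff (umin(~a, ~b) == ~umax(a, b))
// Scalar model of the SMIN case, where A and B are the raw bit patterns of
// the signed values; the helper name is hypothetical.
#include <cstdint>

static inline uint16_t ExampleSMinViaUMin(uint16_t A, uint16_t B) {
  const uint16_t M = 0x8000;
  uint16_t X = static_cast<uint16_t>(A ^ M);
  uint16_t Y = static_cast<uint16_t>(B ^ M);
  uint16_t UMin = X < Y ? X : Y;            // what PHMINPOSUW computes
  return static_cast<uint16_t>(UMin ^ M);   // the signed minimum of A and B
}
// --------------------------------------------------------------------------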
45536
45537// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
45538 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
45539 const X86Subtarget &Subtarget) {
45540 // Bail without SSE2.
45541 if (!Subtarget.hasSSE2())
45542 return SDValue();
45543
45544 EVT ExtractVT = Extract->getValueType(0);
45545 unsigned BitWidth = ExtractVT.getSizeInBits();
45546 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
45547 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
45548 return SDValue();
45549
45550 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
45551 ISD::NodeType BinOp;
45552 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
45553 if (!Match && ExtractVT == MVT::i1)
45554 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
45555 if (!Match)
45556 return SDValue();
45557
45558 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
45559 // which we can't support here for now.
45560 if (Match.getScalarValueSizeInBits() != BitWidth)
45561 return SDValue();
45562
45563 SDValue Movmsk;
45564 SDLoc DL(Extract);
45565 EVT MatchVT = Match.getValueType();
45566 unsigned NumElts = MatchVT.getVectorNumElements();
45567 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45568 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45569 LLVMContext &Ctx = *DAG.getContext();
45570
45571 if (ExtractVT == MVT::i1) {
45572 // Special case for (pre-legalization) vXi1 reductions.
45573 if (NumElts > 64 || !isPowerOf2_32(NumElts))
45574 return SDValue();
45575 if (Match.getOpcode() == ISD::SETCC) {
45576 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45577 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45578 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45579 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45580 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45581 X86::CondCode X86CC;
45582 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45583 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45584 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45585 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45586 DAG, X86CC))
45587 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45588 getSETCC(X86CC, V, DL, DAG));
45589 }
45590 }
45591 if (TLI.isTypeLegal(MatchVT)) {
45592 // If this is a legal AVX512 predicate type then we can just bitcast.
45593 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45594 Movmsk = DAG.getBitcast(MovmskVT, Match);
45595 } else {
45596 // Use combineBitcastvxi1 to create the MOVMSK.
45597 while (NumElts > MaxElts) {
45598 SDValue Lo, Hi;
45599 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45600 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45601 NumElts /= 2;
45602 }
45603 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45604 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45605 }
45606 if (!Movmsk)
45607 return SDValue();
45608 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45609 } else {
45610 // FIXME: Better handling of k-registers or 512-bit vectors?
45611 unsigned MatchSizeInBits = Match.getValueSizeInBits();
45612 if (!(MatchSizeInBits == 128 ||
45613 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45614 return SDValue();
45615
45616 // Make sure this isn't a vector of 1 element. The perf win from using
45617 // MOVMSK diminishes with fewer elements in the reduction, but it is
45618 // generally better to get the comparison over to the GPRs as soon as
45619 // possible to reduce the number of vector ops.
45620 if (Match.getValueType().getVectorNumElements() < 2)
45621 return SDValue();
45622
45623 // Check that we are extracting a reduction of all sign bits.
45624 if (DAG.ComputeNumSignBits(Match) != BitWidth)
45625 return SDValue();
45626
45627 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45628 SDValue Lo, Hi;
45629 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45630 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45631 MatchSizeInBits = Match.getValueSizeInBits();
45632 }
45633
45634 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45635 MVT MaskSrcVT;
45636 if (64 == BitWidth || 32 == BitWidth)
45637 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45638 MatchSizeInBits / BitWidth);
45639 else
45640 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45641
45642 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45643 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45644 NumElts = MaskSrcVT.getVectorNumElements();
45645 }
45646 assert((NumElts <= 32 || NumElts == 64) &&
45647 "Not expecting more than 64 elements");
45648
45649 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45650 if (BinOp == ISD::XOR) {
45651 // parity -> (PARITY(MOVMSK X))
45652 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45653 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45654 }
45655
45656 SDValue CmpC;
45657 ISD::CondCode CondCode;
45658 if (BinOp == ISD::OR) {
45659 // any_of -> MOVMSK != 0
45660 CmpC = DAG.getConstant(0, DL, CmpVT);
45661 CondCode = ISD::CondCode::SETNE;
45662 } else {
45663 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45664 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45665 DL, CmpVT);
45666 CondCode = ISD::CondCode::SETEQ;
45667 }
45668
45669 // The setcc produces an i8 of 0/1, so extend that to the result width and
45670 // negate to get the final 0/-1 mask value.
45671 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45672 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45673 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45674 return DAG.getNegative(Zext, DL, ExtractVT);
45675}
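// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source): once
// MOVMSK has packed one sign bit per element into a GPR, the reductions above
// collapse into plain scalar tests. NumElts is assumed to be at most 32 here;
// the helper names are hypothetical.
#include <cstdint>

static inline bool ExampleAnyOf(uint32_t Movmsk) { return Movmsk != 0; }

static inline bool ExampleAllOf(uint32_t Movmsk, unsigned NumElts) {
  uint32_t Full =
      NumElts >= 32 ? 0xffffffffu : ((1u << NumElts) - 1); // low NumElts bits
  return Movmsk == Full;
}

static inline bool ExampleParityOf(uint32_t Movmsk) {
  Movmsk ^= Movmsk >> 16;
  Movmsk ^= Movmsk >> 8;
  Movmsk ^= Movmsk >> 4;
  Movmsk ^= Movmsk >> 2;
  Movmsk ^= Movmsk >> 1;
  return (Movmsk & 1) != 0; // XOR of all mask bits
}
// --------------------------------------------------------------------------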
45676
45677 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45678 const X86Subtarget &Subtarget) {
45679 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45680 return SDValue();
45681
45682 EVT ExtractVT = Extract->getValueType(0);
45683 // Verify the type we're extracting is i32, as the output element type of
45684 // vpdpbusd is i32.
45685 if (ExtractVT != MVT::i32)
45686 return SDValue();
45687
45688 EVT VT = Extract->getOperand(0).getValueType();
45689 if (!isPowerOf2_32(VT.getVectorNumElements()))
45690 return SDValue();
45691
45692 // Match shuffle + add pyramid.
45693 ISD::NodeType BinOp;
45694 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45695
45696 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45697 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
45698 // before adding into the accumulator.
45699 // TODO:
45700 // We also need to verify that the multiply has at least 2x the number of bits
45701 // of the input. We shouldn't match
45702 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
45703 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45704 // Root = Root.getOperand(0);
45705
45706 // If there was a match, we want Root to be a mul.
45707 if (!Root || Root.getOpcode() != ISD::MUL)
45708 return SDValue();
45709
45710 // Check whether we have an extend and mul pattern
45711 SDValue LHS, RHS;
45712 if (!detectExtMul(DAG, Root, LHS, RHS))
45713 return SDValue();
45714
45715 // Create the dot product instruction.
45716 SDLoc DL(Extract);
45717 unsigned StageBias;
45718 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45719
45720 // If the original vector was wider than 4 elements, sum over the results
45721 // in the DP vector.
45722 unsigned Stages = Log2_32(VT.getVectorNumElements());
45723 EVT DpVT = DP.getValueType();
45724
45725 if (Stages > StageBias) {
45726 unsigned DpElems = DpVT.getVectorNumElements();
45727
45728 for (unsigned i = Stages - StageBias; i > 0; --i) {
45729 SmallVector<int, 16> Mask(DpElems, -1);
45730 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45731 Mask[j] = MaskEnd + j;
45732
45733 SDValue Shuffle =
45734 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45735 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45736 }
45737 }
45738
45739 // Return the lowest ExtractSizeInBits bits.
45740 EVT ResVT =
45741 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45742 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45743 DP = DAG.getBitcast(ResVT, DP);
45744 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45745 Extract->getOperand(1));
45746}
45747
45748 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45749 const X86Subtarget &Subtarget) {
45750 // PSADBW is only supported on SSE2 and up.
45751 if (!Subtarget.hasSSE2())
45752 return SDValue();
45753
45754 EVT ExtractVT = Extract->getValueType(0);
45755 // Verify the type we're extracting is either i32 or i64.
45756 // FIXME: Could support other types, but this is what we have coverage for.
45757 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45758 return SDValue();
45759
45760 EVT VT = Extract->getOperand(0).getValueType();
45761 if (!isPowerOf2_32(VT.getVectorNumElements()))
45762 return SDValue();
45763
45764 // Match shuffle + add pyramid.
45765 ISD::NodeType BinOp;
45766 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45767
45768 // The operand is expected to be zero extended from i8
45769 // (verified in detectZextAbsDiff).
45770 // In order to convert to i64 and above, additional any/zero/sign
45771 // extend is expected.
45772 // The zero extend from 32 bit has no mathematical effect on the result.
45773 // Also, the sign extend here is effectively a zero extend
45774 // (it extends the sign bit, which is known to be zero).
45775 // So it is correct to skip the sign/zero extend instruction.
45776 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
45777 Root.getOpcode() == ISD::ZERO_EXTEND ||
45778 Root.getOpcode() == ISD::ANY_EXTEND))
45779 Root = Root.getOperand(0);
45780
45781 // If there was a match, we want Root to be a select that is the root of an
45782 // abs-diff pattern.
45783 if (!Root || Root.getOpcode() != ISD::ABS)
45784 return SDValue();
45785
45786 // Check whether we have an abs-diff pattern feeding into the select.
45787 SDValue Zext0, Zext1;
45788 if (!detectZextAbsDiff(Root, Zext0, Zext1))
45789 return SDValue();
45790
45791 // Create the SAD instruction.
45792 SDLoc DL(Extract);
45793 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45794
45795 // If the original vector was wider than 8 elements, sum over the results
45796 // in the SAD vector.
45797 unsigned Stages = Log2_32(VT.getVectorNumElements());
45798 EVT SadVT = SAD.getValueType();
45799 if (Stages > 3) {
45800 unsigned SadElems = SadVT.getVectorNumElements();
45801
45802 for(unsigned i = Stages - 3; i > 0; --i) {
45803 SmallVector<int, 16> Mask(SadElems, -1);
45804 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45805 Mask[j] = MaskEnd + j;
45806
45807 SDValue Shuffle =
45808 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45809 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45810 }
45811 }
45812
45813 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45814 // Return the lowest ExtractSizeInBits bits.
45815 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45816 SadVT.getSizeInBits() / ExtractSizeInBits);
45817 SAD = DAG.getBitcast(ResVT, SAD);
45818 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45819 Extract->getOperand(1));
45820}
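// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source): the
// "Stages - 3" shuffle+add loop above is a standard log2 tree reduction. On
// each step the upper half of the live lanes is shifted down and added onto
// the lower half until a single lane remains. Scalar model over an array of
// lane values; the helper name is hypothetical and NumLanes is assumed to be
// a power of two, as in the vector case.
#include <cstdint>

static inline uint64_t ExampleTreeReduceAdd(uint64_t Lanes[],
                                            unsigned NumLanes) {
  for (unsigned Live = NumLanes; Live > 1; Live /= 2)
    for (unsigned I = 0; I != Live / 2; ++I)
      Lanes[I] += Lanes[I + Live / 2]; // add the shifted-down upper half
  return Lanes[0];
}
// --------------------------------------------------------------------------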
45821
45822// If this extract is from a loaded vector value and will be used as an
45823// integer, that requires a potentially expensive XMM -> GPR transfer.
45824// Additionally, if we can convert to a scalar integer load, that will likely
45825// be folded into a subsequent integer op.
45826// Note: SrcVec might not have a VecVT type, but it must be the same size.
45827// Note: Unlike the related fold for this in DAGCombiner, this is not limited
45828// to a single-use of the loaded vector. For the reasons above, we
45829// expect this to be profitable even if it creates an extra load.
45830static SDValue
45831 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
45832 const SDLoc &dl, SelectionDAG &DAG,
45833 TargetLowering::DAGCombinerInfo &DCI) {
45834 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45835 "Only EXTRACT_VECTOR_ELT supported so far");
45836
45837 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45838 EVT VT = N->getValueType(0);
45839
45840 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
45841 return Use->getOpcode() == ISD::STORE ||
45842 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45843 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45844 });
45845
45846 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
45847 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45848 VecVT.getVectorElementType() == VT &&
45849 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
45850 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
45851 SDValue NewPtr = TLI.getVectorElementPointer(
45852 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
45853 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
45854 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45855 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45856 SDValue Load =
45857 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45858 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45859 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45860 return Load;
45861 }
45862
45863 return SDValue();
45864}
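// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source): when
// the vector operand comes straight from memory, extracting lane Idx is just
// a narrower load at byte offset Idx * (element size in bytes) - exactly the
// PtrOff computation above - which avoids an XMM -> GPR transfer. The helper
// names are hypothetical.
#include <cstdint>

static inline unsigned ExampleLaneByteOffset(unsigned EltSizeInBits,
                                             unsigned Idx) {
  return EltSizeInBits * Idx / 8; // mirrors the PtrOff computation above
}

static inline int32_t ExampleLoadLane(const int32_t *VecInMemory,
                                      unsigned Idx) {
  return VecInMemory[Idx]; // scalar load from Base + Idx * 4 bytes
}
// --------------------------------------------------------------------------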
45865
45866// Attempt to peek through a target shuffle and extract the scalar from the
45867// source.
45868 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
45869 TargetLowering::DAGCombinerInfo &DCI,
45870 const X86Subtarget &Subtarget) {
45871 if (DCI.isBeforeLegalizeOps())
45872 return SDValue();
45873
45874 SDLoc dl(N);
45875 SDValue Src = N->getOperand(0);
45876 SDValue Idx = N->getOperand(1);
45877
45878 EVT VT = N->getValueType(0);
45879 EVT SrcVT = Src.getValueType();
45880 EVT SrcSVT = SrcVT.getVectorElementType();
45881 unsigned SrcEltBits = SrcSVT.getSizeInBits();
45882 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45883
45884 // Don't attempt this for boolean mask vectors or unknown extraction indices.
45885 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45886 return SDValue();
45887
45888 const APInt &IdxC = N->getConstantOperandAPInt(1);
45889 if (IdxC.uge(NumSrcElts))
45890 return SDValue();
45891
45892 SDValue SrcBC = peekThroughBitcasts(Src);
45893
45894 // Handle extract(bitcast(broadcast(scalar_value))).
45895 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45896 SDValue SrcOp = SrcBC.getOperand(0);
45897 EVT SrcOpVT = SrcOp.getValueType();
45898 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45899 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45900 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45901 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45902 // TODO support non-zero offsets.
45903 if (Offset == 0) {
45904 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45905 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45906 return SrcOp;
45907 }
45908 }
45909 }
45910
45911 // If we're extracting a single element from a broadcast load and there are
45912 // no other users, just create a single load.
45913 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45914 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45915 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45916 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45917 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45918 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45919 MemIntr->getBasePtr(),
45920 MemIntr->getPointerInfo(),
45921 MemIntr->getOriginalAlign(),
45922 MemIntr->getMemOperand()->getFlags());
45923 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45924 return Load;
45925 }
45926 }
45927
45928 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45929 // TODO: Move to DAGCombine?
45930 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45931 SrcBC.getValueType().isInteger() &&
45932 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45933 SrcBC.getScalarValueSizeInBits() ==
45934 SrcBC.getOperand(0).getValueSizeInBits()) {
45935 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45936 if (IdxC.ult(Scale)) {
45937 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45938 SDValue Scl = SrcBC.getOperand(0);
45939 EVT SclVT = Scl.getValueType();
45940 if (Offset) {
45941 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
45942 DAG.getShiftAmountConstant(Offset, SclVT, dl));
45943 }
45944 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
45945 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
45946 return Scl;
45947 }
45948 }
45949
45950 // Handle extract(truncate(x)) for 0'th index.
45951 // TODO: Treat this as a faux shuffle?
45952 // TODO: When can we use this for general indices?
45953 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
45954 (SrcVT.getSizeInBits() % 128) == 0) {
45955 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
45956 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
45957 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45958 Idx);
45959 }
45960
45961 // We can only legally extract other elements from 128-bit vectors and in
45962 // certain circumstances, depending on SSE-level.
45963 // TODO: Investigate float/double extraction if it will be just stored.
45964 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
45965 unsigned Idx) {
45966 EVT VecSVT = VecVT.getScalarType();
45967 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
45968 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
45969 VecSVT == MVT::i64)) {
45970 unsigned EltSizeInBits = VecSVT.getSizeInBits();
45971 unsigned NumEltsPerLane = 128 / EltSizeInBits;
45972 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45973 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
45974 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
45975 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
45976 Idx &= (NumEltsPerLane - 1);
45977 }
45978 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
45979 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
45980 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
45981 DAG.getBitcast(VecVT, Vec),
45982 DAG.getVectorIdxConstant(Idx, dl));
45983 }
45984 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
45985 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
45986 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
45987 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
45988 DAG.getTargetConstant(Idx, dl, MVT::i8));
45989 }
45990 return SDValue();
45991 };
45992
45993 // Resolve the target shuffle inputs and mask.
45994 SmallVector<int, 16> Mask;
45995 SmallVector<SDValue, 2> Ops;
45996 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
45997 return SDValue();
45998
45999 // Shuffle inputs must be the same size as the result.
46000 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46001 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46002 }))
46003 return SDValue();
46004
46005 // Attempt to narrow/widen the shuffle mask to the correct size.
46006 if (Mask.size() != NumSrcElts) {
46007 if ((NumSrcElts % Mask.size()) == 0) {
46008 SmallVector<int, 16> ScaledMask;
46009 int Scale = NumSrcElts / Mask.size();
46010 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46011 Mask = std::move(ScaledMask);
46012 } else if ((Mask.size() % NumSrcElts) == 0) {
46013 // Simplify Mask based on demanded element.
46014 int ExtractIdx = (int)IdxC.getZExtValue();
46015 int Scale = Mask.size() / NumSrcElts;
46016 int Lo = Scale * ExtractIdx;
46017 int Hi = Scale * (ExtractIdx + 1);
46018 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46019 if (i < Lo || Hi <= i)
46020 Mask[i] = SM_SentinelUndef;
46021
46022 SmallVector<int, 16> WidenedMask;
46023 while (Mask.size() > NumSrcElts &&
46024 canWidenShuffleElements(Mask, WidenedMask))
46025 Mask = std::move(WidenedMask);
46026 }
46027 }
46028
46029 // If narrowing/widening failed, see if we can extract+zero-extend.
46030 int ExtractIdx;
46031 EVT ExtractVT;
46032 if (Mask.size() == NumSrcElts) {
46033 ExtractIdx = Mask[IdxC.getZExtValue()];
46034 ExtractVT = SrcVT;
46035 } else {
46036 unsigned Scale = Mask.size() / NumSrcElts;
46037 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46038 return SDValue();
46039 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46040 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46041 return SDValue();
46042 ExtractIdx = Mask[ScaledIdx];
46043 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46044 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46045 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46046 "Failed to widen vector type");
46047 }
46048
46049 // If the shuffle source element is undef/zero then we can just accept it.
46050 if (ExtractIdx == SM_SentinelUndef)
46051 return DAG.getUNDEF(VT);
46052
46053 if (ExtractIdx == SM_SentinelZero)
46054 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46055 : DAG.getConstant(0, dl, VT);
46056
46057 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46058 ExtractIdx = ExtractIdx % Mask.size();
46059 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46060 return DAG.getZExtOrTrunc(V, dl, VT);
46061
46062 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46063 if (SDValue V = combineExtractFromVectorLoad(
46064 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46065 return V;
46066
46067 return SDValue();
46068}
46069
46070/// Extracting a scalar FP value from vector element 0 is free, so extract each
46071/// operand first, then perform the math as a scalar op.
46072 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46073 const X86Subtarget &Subtarget,
46074 TargetLowering::DAGCombinerInfo &DCI) {
46075 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46076 SDValue Vec = ExtElt->getOperand(0);
46077 SDValue Index = ExtElt->getOperand(1);
46078 EVT VT = ExtElt->getValueType(0);
46079 EVT VecVT = Vec.getValueType();
46080
46081 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46082 // non-zero element because the shuffle+scalar op will be cheaper?
46083 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46084 return SDValue();
46085
46086 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46087 // extract, the condition code), so deal with those as a special-case.
46088 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46089 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46090 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46091 return SDValue();
46092
46093 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46094 SDLoc DL(ExtElt);
46095 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46096 Vec.getOperand(0), Index);
46097 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46098 Vec.getOperand(1), Index);
46099 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46100 }
46101
46102 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46103 VT != MVT::f64)
46104 return SDValue();
46105
46106 // Vector FP selects don't fit the pattern of FP math ops (because the
46107 // condition has a different type and we have to change the opcode), so deal
46108 // with those here.
46109 // FIXME: This is restricted to pre type legalization. If we loosen this we
46110 // need to convert vector bool to a scalar bool.
46111 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46112 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46113 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
46114 assert(Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
46115 "Unexpected cond type for combine");
46116 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46117 SDLoc DL(ExtElt);
46118 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46119 Vec.getOperand(0).getValueType().getScalarType(),
46120 Vec.getOperand(0), Index);
46121 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46122 Vec.getOperand(1), Index);
46123 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46124 Vec.getOperand(2), Index);
46125 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46126 }
46127
46128 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46129 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46130 // missed load folding and fma+fneg combining.
46131 switch (Vec.getOpcode()) {
46132 case ISD::FMA: // Begin 3 operands
46133 case ISD::FMAD:
46134 case ISD::FADD: // Begin 2 operands
46135 case ISD::FSUB:
46136 case ISD::FMUL:
46137 case ISD::FDIV:
46138 case ISD::FREM:
46139 case ISD::FCOPYSIGN:
46140 case ISD::FMINNUM:
46141 case ISD::FMAXNUM:
46142 case ISD::FMINNUM_IEEE:
46143 case ISD::FMAXNUM_IEEE:
46144 case ISD::FMAXIMUM:
46145 case ISD::FMINIMUM:
46146 case ISD::FMAXIMUMNUM:
46147 case ISD::FMINIMUMNUM:
46148 case X86ISD::FMAX:
46149 case X86ISD::FMIN:
46150 case ISD::FABS: // Begin 1 operand
46151 case ISD::FSQRT:
46152 case ISD::FRINT:
46153 case ISD::FCEIL:
46154 case ISD::FTRUNC:
46155 case ISD::FNEARBYINT:
46156 case ISD::FROUNDEVEN:
46157 case ISD::FROUND:
46158 case ISD::FFLOOR:
46159 case X86ISD::FRCP:
46160 case X86ISD::FRSQRT: {
46161 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46162 SDLoc DL(ExtElt);
46163 SmallVector<SDValue, 4> ExtOps;
46164 for (SDValue Op : Vec->ops())
46165 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46166 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46167 }
46168 default:
46169 return SDValue();
46170 }
46171 llvm_unreachable("All opcodes should return within switch");
46172}
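// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source): the
// rewrite above relies on lane-wise FP ops commuting with extraction of lane
// 0, e.g. extract(fadd(X, Y), 0) == fadd(extract(X, 0), extract(Y, 0)), so a
// scalar ADDSS can replace a full-width ADDPS when only lane 0 is used.
// Scalar model with arrays standing in for vectors; hypothetical names.
static inline float ExampleAddThenExtract(const float X[4], const float Y[4]) {
  float Sum[4];
  for (int I = 0; I != 4; ++I)
    Sum[I] = X[I] + Y[I]; // vector-style lane-wise add
  return Sum[0];
}

static inline float ExampleExtractThenAdd(const float X[4], const float Y[4]) {
  return X[0] + Y[0]; // same result, using only the scalar op
}
// --------------------------------------------------------------------------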
46173
46174/// Try to convert a vector reduction sequence composed of binops and shuffles
46175/// into horizontal ops.
46176 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46177 const X86Subtarget &Subtarget) {
46178 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46179
46180 // We need at least SSE2 to do anything here.
46181 if (!Subtarget.hasSSE2())
46182 return SDValue();
46183
46184 ISD::NodeType Opc;
46185 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46186 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46187 if (!Rdx)
46188 return SDValue();
46189
46190 SDValue Index = ExtElt->getOperand(1);
46191 assert(isNullConstant(Index) &&
46192 "Reduction doesn't end in an extract from index 0");
46193
46194 EVT VT = ExtElt->getValueType(0);
46195 EVT VecVT = Rdx.getValueType();
46196 if (VecVT.getScalarType() != VT)
46197 return SDValue();
46198
46199 SDLoc DL(ExtElt);
46200 unsigned NumElts = VecVT.getVectorNumElements();
46201 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46202
46203 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46204 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46205 if (V.getValueType() == MVT::v4i8) {
46206 if (ZeroExtend && Subtarget.hasSSE41()) {
46207 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46208 DAG.getConstant(0, DL, MVT::v4i32),
46209 DAG.getBitcast(MVT::i32, V),
46210 DAG.getVectorIdxConstant(0, DL));
46211 return DAG.getBitcast(MVT::v16i8, V);
46212 }
46213 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46214 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46215 : DAG.getUNDEF(MVT::v4i8));
46216 }
46217 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46218 DAG.getUNDEF(MVT::v8i8));
46219 };
46220
46221 // vXi8 mul reduction - promote to vXi16 mul reduction.
46222 if (Opc == ISD::MUL) {
46223 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
46224 return SDValue();
46225 if (VecVT.getSizeInBits() >= 128) {
46226 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
46227 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46228 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46229 Lo = DAG.getBitcast(WideVT, Lo);
46230 Hi = DAG.getBitcast(WideVT, Hi);
46231 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
46232 while (Rdx.getValueSizeInBits() > 128) {
46233 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46234 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
46235 }
46236 } else {
46237 Rdx = WidenToV16I8(Rdx, false);
46238 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
46239 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
46240 }
46241 if (NumElts >= 8)
46242 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46243 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46244 {4, 5, 6, 7, -1, -1, -1, -1}));
46245 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46246 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46247 {2, 3, -1, -1, -1, -1, -1, -1}));
46248 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46249 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46250 {1, -1, -1, -1, -1, -1, -1, -1}));
46251 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46252 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46253 }
46254
46255 // vXi8 add reduction - sub 128-bit vector.
46256 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
46257 Rdx = WidenToV16I8(Rdx, true);
46258 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46259 DAG.getConstant(0, DL, MVT::v16i8));
46260 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46261 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46262 }
46263
46264 // Must be a >=128-bit vector with pow2 elements.
46265 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
46266 return SDValue();
46267
46268 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
46269 if (VT == MVT::i8) {
46270 while (Rdx.getValueSizeInBits() > 128) {
46271 SDValue Lo, Hi;
46272 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46273 VecVT = Lo.getValueType();
46274 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
46275 }
46276 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
46277
46278 SDValue Hi = DAG.getVectorShuffle(
46279 MVT::v16i8, DL, Rdx, Rdx,
46280 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
46281 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
46282 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
46283 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
46284 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46285 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46286 }
46287
46288 // See if we can use vXi8 PSADBW add reduction for larger zext types.
46289 // If the source vector values are 0-255, then we can use PSADBW to
46290 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
46291 // TODO: See if its worth avoiding vXi16/i32 truncations?
46292 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
46293 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
46294 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
46295 Subtarget.hasAVX512())) {
46296 if (Rdx.getValueType() == MVT::v8i16) {
46297 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
46298 DAG.getUNDEF(MVT::v8i16));
46299 } else {
46300 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
46301 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
46302 if (ByteVT.getSizeInBits() < 128)
46303 Rdx = WidenToV16I8(Rdx, true);
46304 }
46305
46306 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46307 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46308 ArrayRef<SDValue> Ops) {
46309 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46310 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
46311 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
46312 };
46313 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
46314 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
46315
46316 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
46317 while (Rdx.getValueSizeInBits() > 128) {
46318 SDValue Lo, Hi;
46319 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46320 VecVT = Lo.getValueType();
46321 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
46322 }
46323 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
46324
46325 if (NumElts > 8) {
46326 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
46327 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
46328 }
46329
46330 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
46331 Rdx = DAG.getBitcast(VecVT, Rdx);
46332 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46333 }
46334
46335 // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
46336 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
46337 return SDValue();
46338
46339 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
46340
46341 // 256-bit horizontal instructions operate on 128-bit chunks rather than
46342 // across the whole vector, so we need an extract + hop preliminary stage.
46343 // This is the only step where the operands of the hop are not the same value.
46344 // TODO: We could extend this to handle 512-bit or even longer vectors.
46345 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
46346 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
46347 unsigned NumElts = VecVT.getVectorNumElements();
46348 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
46349 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
46350 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
46351 VecVT = Rdx.getValueType();
46352 }
46353 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
46354 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
46355 return SDValue();
46356
46357 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
46358 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
46359 for (unsigned i = 0; i != ReductionSteps; ++i)
46360 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
46361
46362 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46363}
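// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source): for
// the vXi8 add reductions above, PSADBW against an all-zero vector is used as
// a horizontal byte adder, because the sum of |x - 0| over eight unsigned
// bytes is simply the sum of those bytes. Scalar model; hypothetical name.
#include <cstdint>

static inline unsigned ExampleSumBytesViaPsadbw(const uint8_t Bytes[8]) {
  unsigned Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += Bytes[I]; // |Bytes[I] - 0| == Bytes[I]
  return Sum;
}
// --------------------------------------------------------------------------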
46364
46365/// Detect vector gather/scatter index generation and convert it from being a
46366/// bunch of shuffles and extracts into a somewhat faster sequence.
46367/// For i686, the best sequence is apparently storing the value and loading
46368/// scalars back, while for x64 we should use 64-bit extracts and shifts.
46369 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
46370 TargetLowering::DAGCombinerInfo &DCI,
46371 const X86Subtarget &Subtarget) {
46372 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
46373 return NewOp;
46374
46375 SDValue InputVector = N->getOperand(0);
46376 SDValue EltIdx = N->getOperand(1);
46377 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
46378
46379 EVT SrcVT = InputVector.getValueType();
46380 EVT VT = N->getValueType(0);
46381 SDLoc dl(InputVector);
46382 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
46383 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46384 unsigned NumEltBits = VT.getScalarSizeInBits();
46385 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46386
46387 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
46388 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46389
46390 // Integer Constant Folding.
46391 if (CIdx && VT.isInteger()) {
46392 APInt UndefVecElts;
46393 SmallVector<APInt, 16> EltBits;
46394 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
46395 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
46396 EltBits, /*AllowWholeUndefs*/ true,
46397 /*AllowPartialUndefs*/ false)) {
46398 uint64_t Idx = CIdx->getZExtValue();
46399 if (UndefVecElts[Idx])
46400 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46401 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
46402 }
46403
46404 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
46405 // Improves lowering of bool masks on Rust, which splits them into a byte array.
46406 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
46407 SDValue Src = peekThroughBitcasts(InputVector);
46408 if (Src.getValueType().getScalarType() == MVT::i1 &&
46409 TLI.isTypeLegal(Src.getValueType())) {
46410 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
46411 SDValue Sub = DAG.getNode(
46412 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
46413 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
46414 return DAG.getBitcast(VT, Sub);
46415 }
46416 }
46417 }
46418
46419 if (IsPextr) {
46420 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
46421 DCI))
46422 return SDValue(N, 0);
46423
46424 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
46425 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
46426 InputVector.getOpcode() == X86ISD::PINSRW) &&
46427 InputVector.getOperand(2) == EltIdx) {
46428 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
46429 "Vector type mismatch");
46430 SDValue Scl = InputVector.getOperand(1);
46431 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
46432 return DAG.getZExtOrTrunc(Scl, dl, VT);
46433 }
46434
46435 // TODO - Remove this once we can handle the implicit zero-extension of
46436 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
46437 // combineBasicSADPattern.
46438 return SDValue();
46439 }
46440
46441 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
46442 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
46443 InputVector.getOpcode() == ISD::BITCAST &&
46444 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46445 isNullConstant(EltIdx) && InputVector.hasOneUse())
46446 return DAG.getBitcast(VT, InputVector);
46447
46448 // Detect mmx to i32 conversion through a v2i32 elt extract.
46449 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
46450 InputVector.getOpcode() == ISD::BITCAST &&
46451 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46452 isNullConstant(EltIdx) && InputVector.hasOneUse())
46453 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
46454 InputVector.getOperand(0));
46455
46456 // Check whether this extract is the root of a sum of absolute differences
46457 // pattern. This has to be done here because we really want it to happen
46458 // pre-legalization.
46459 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
46460 return SAD;
46461
46462 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
46463 return VPDPBUSD;
46464
46465 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
46466 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
46467 return Cmp;
46468
46469 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
46470 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
46471 return MinMax;
46472
46473 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
46474 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
46475 return V;
46476
46477 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
46478 return V;
46479
46480 if (CIdx)
46481 if (SDValue V = combineExtractFromVectorLoad(
46482 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
46483 dl, DAG, DCI))
46484 return V;
46485
46486 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
46487 // and then testing the relevant element.
46488 //
46489 // Note that we only combine extracts on the *same* result number, i.e.
46490 // t0 = merge_values a0, a1, a2, a3
46491 // i1 = extract_vector_elt t0, Constant:i64<2>
46492 // i1 = extract_vector_elt t0, Constant:i64<3>
46493 // but not
46494 // i1 = extract_vector_elt t0:1, Constant:i64<2>
46495 // since the latter would need its own MOVMSK.
46496 if (SrcVT.getScalarType() == MVT::i1) {
46497 bool IsVar = !CIdx;
46498 SmallVector<SDNode *, 16> BoolExtracts;
46499 unsigned ResNo = InputVector.getResNo();
46500 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
46501 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46502 Use->getOperand(0).getResNo() == ResNo &&
46503 Use->getValueType(0) == MVT::i1) {
46504 BoolExtracts.push_back(Use);
46505 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
46506 return true;
46507 }
46508 return false;
46509 };
46510 // TODO: Can we drop the oneuse check for constant extracts?
46511 if (all_of(InputVector->users(), IsBoolExtract) &&
46512 (IsVar || BoolExtracts.size() > 1)) {
46513 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
46514 if (SDValue BC =
46515 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
46516 for (SDNode *Use : BoolExtracts) {
46517 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
46518 // Mask = 1 << MaskIdx
46519 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
46520 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
46521 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
46522 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
46523 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
46524 DCI.CombineTo(Use, Res);
46525 }
46526 return SDValue(N, 0);
46527 }
46528 }
46529 }
46530
46531 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
46532 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
46533 SDValue TruncSrc = InputVector.getOperand(0);
46534 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
46535 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
46536 SDValue NewExt =
46537 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
46538 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
46539 }
46540 }
46541
46542 return SDValue();
46543}
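// --------------------------------------------------------------------------
// Illustrative aside (editor's sketch, not part of the original source): the
// "extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask" rewrite
// above becomes a plain bit test once the sign bits sit in a GPR. Scalar
// model, assuming Idx < 32; the helper name is hypothetical.
#include <cstdint>

static inline bool ExampleExtractBoolLane(uint32_t Movmsk, unsigned Idx) {
  uint32_t Mask = 1u << Idx;      // Mask = 1 << MaskIdx
  return (Movmsk & Mask) == Mask; // matches the AND + SETEQ emitted above
}
// --------------------------------------------------------------------------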
46544
46545// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
46546// This is more or less the reverse of combineBitcastvxi1.
46547 static SDValue combineToExtendBoolVectorInReg(
46548 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
46549 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
46550 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
46551 Opcode != ISD::ANY_EXTEND)
46552 return SDValue();
46553 if (!DCI.isBeforeLegalizeOps())
46554 return SDValue();
46555 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46556 return SDValue();
46557
46558 EVT SVT = VT.getScalarType();
46559 EVT InSVT = N0.getValueType().getScalarType();
46560 unsigned EltSizeInBits = SVT.getSizeInBits();
46561
46562 // Input type must be extending a bool vector (bit-casted from a scalar
46563 // integer) to legal integer types.
46564 if (!VT.isVector())
46565 return SDValue();
46566 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
46567 return SDValue();
46568 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
46569 return SDValue();
46570
46571 SDValue N00 = N0.getOperand(0);
46572 EVT SclVT = N00.getValueType();
46573 if (!SclVT.isScalarInteger())
46574 return SDValue();
46575
46576 SDValue Vec;
46577 SmallVector<int> ShuffleMask;
46578 unsigned NumElts = VT.getVectorNumElements();
46579 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
46580
46581 // Broadcast the scalar integer to the vector elements.
46582 if (NumElts > EltSizeInBits) {
46583 // If the scalar integer is greater than the vector element size, then we
46584 // must split it down into sub-sections for broadcasting. For example:
46585 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46586 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46587 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46588 unsigned Scale = NumElts / EltSizeInBits;
46589 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
46590 bool UseBroadcast = Subtarget.hasInt256() &&
46591 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
46592 Vec = UseBroadcast
46593 ? DAG.getSplat(BroadcastVT, DL, N00)
46594 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46595 Vec = DAG.getBitcast(VT, Vec);
46596
46597 for (unsigned i = 0; i != Scale; ++i) {
46598 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
46599 ShuffleMask.append(EltSizeInBits, i + Offset);
46600 }
46601 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46602 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46603 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46604 // If we have register broadcast instructions, use the scalar size as the
46605 // element type for the shuffle. Then cast to the wider element type. The
46606 // widened bits won't be used, and this might allow the use of a broadcast
46607 // load.
46608 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46609 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
46610 (NumElts * EltSizeInBits) / NumElts);
46611 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
46612 } else {
46613 // For a smaller scalar integer, we can simply any-extend it to the vector
46614 // element size (we don't care about the upper bits) and broadcast it to all
46615 // elements.
46616 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
46617 }
46618
46619 // Now, mask the relevant bit in each element.
46620 SmallVector<SDValue, 32> Bits;
46621 for (unsigned i = 0; i != NumElts; ++i) {
46622 int BitIdx = (i % EltSizeInBits);
46623 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46624 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46625 }
46626 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46627 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46628
46629 // Compare against the bitmask and extend the result.
46630 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
46631 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46632 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46633
46634 // For SEXT, this is now done, otherwise shift the result down for
46635 // zero-extension.
46636 if (Opcode == ISD::SIGN_EXTEND)
46637 return Vec;
46638 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46639 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46640}
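
// Worked example for the combine above (a sketch, not part of the upstream
// comments; assumes SSE2 without AVX512, where vXi1 masks are not legal):
//   t0: i8
//   t1: v8i1  = bitcast t0
//   t2: v8i16 = sign_extend t1
// is rewritten roughly as
//   bcast = splat(any_extend t0 to i16) : v8i16
//   bits  = <1,2,4,8,16,32,64,128>
//   t2    = sext(setcc (and bcast, bits), bits, seteq)
// i.e. each lane becomes all-ones iff its source bit was set. For ZERO_EXTEND
// the result is additionally shifted right by 15 so lanes hold 0/1 instead of
// 0/-1.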
46641
46642/// If a vector select has an operand that is -1 or 0, try to simplify the
46643/// select to a bitwise logic operation.
46644/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
46645static SDValue
46646combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
46647 TargetLowering::DAGCombinerInfo &DCI,
46648 const X86Subtarget &Subtarget) {
46649 SDValue Cond = N->getOperand(0);
46650 SDValue LHS = N->getOperand(1);
46651 SDValue RHS = N->getOperand(2);
46652 EVT VT = LHS.getValueType();
46653 EVT CondVT = Cond.getValueType();
46654 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46655
46656 if (N->getOpcode() != ISD::VSELECT)
46657 return SDValue();
46658
46659 assert(CondVT.isVector() && "Vector select expects a vector selector!");
46660
46661 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46662 // TODO: Can we assert that both operands are not zeros (because that should
46663 // get simplified at node creation time)?
46664 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46665 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46666
46667 // If both inputs are 0/undef, create a complete zero vector.
46668 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46669 if (TValIsAllZeros && FValIsAllZeros) {
46670 if (VT.isFloatingPoint())
46671 return DAG.getConstantFP(0.0, DL, VT);
46672 return DAG.getConstant(0, DL, VT);
46673 }
46674
46675 // To use the condition operand as a bitwise mask, it must have elements that
46676 // are the same size as the select elements. Ie, the condition operand must
46677 // have already been promoted from the IR select condition type <N x i1>.
46678 // Don't check if the types themselves are equal because that excludes
46679 // vector floating-point selects.
46680 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46681 return SDValue();
46682
46683 // Try to invert the condition if true value is not all 1s and false value is
46684 // not all 0s. Only do this if the condition has one use.
46685 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46686 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46687 // Check if the selector will be produced by CMPP*/PCMP*.
46688 Cond.getOpcode() == ISD::SETCC &&
46689 // Check if SETCC has already been promoted.
46690 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46691 CondVT) {
46692 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46693
46694 if (TValIsAllZeros || FValIsAllOnes) {
46695 SDValue CC = Cond.getOperand(2);
46696 ISD::CondCode NewCC = ISD::getSetCCInverse(
46697 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46698 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46699 NewCC);
46700 std::swap(LHS, RHS);
46701 TValIsAllOnes = FValIsAllOnes;
46702 FValIsAllZeros = TValIsAllZeros;
46703 }
46704 }
46705
46706 // Cond value must be 'sign splat' to be converted to a logical op.
46707 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46708 return SDValue();
46709
46710 // vselect Cond, 111..., 000... -> Cond
46711 if (TValIsAllOnes && FValIsAllZeros)
46712 return DAG.getBitcast(VT, Cond);
46713
46714 if (!TLI.isTypeLegal(CondVT))
46715 return SDValue();
46716
46717 // vselect Cond, 111..., X -> or Cond, X
46718 if (TValIsAllOnes) {
46719 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46720 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46721 return DAG.getBitcast(VT, Or);
46722 }
46723
46724 // vselect Cond, X, 000... -> and Cond, X
46725 if (FValIsAllZeros) {
46726 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46727 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46728 return DAG.getBitcast(VT, And);
46729 }
46730
46731 // vselect Cond, 000..., X -> andn Cond, X
46732 if (TValIsAllZeros) {
46733 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46734 SDValue AndN;
46735 // The canonical form differs for i1 vectors - x86andnp is not used
46736 if (CondVT.getScalarType() == MVT::i1)
46737 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46738 CastRHS);
46739 else
46740 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46741 return DAG.getBitcast(VT, AndN);
46742 }
46743
46744 return SDValue();
46745}
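
// Illustrative pattern for the fold above (sketch, assuming the condition is
// a full-width compare mask as produced by CMPP*/PCMP*): a vectorized form of
//   out[i] = (a[i] < b[i]) ? -1 : c[i];
// yields (vselect (setcc a, b, setlt), all-ones, c), which collapses to a
// single OR of the compare mask with c, with no blend instruction at all.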
46746
46747/// If both arms of a vector select are concatenated vectors, split the select,
46748/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46749/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46750/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46751static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
46752 const X86Subtarget &Subtarget) {
46753 unsigned Opcode = N->getOpcode();
46754 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46755 return SDValue();
46756
46757 // TODO: Split 512-bit vectors too?
46758 EVT VT = N->getValueType(0);
46759 if (!VT.is256BitVector())
46760 return SDValue();
46761
46762 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46763 SDValue Cond = N->getOperand(0);
46764 SDValue TVal = N->getOperand(1);
46765 SDValue FVal = N->getOperand(2);
46766 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46767 !isFreeToSplitVector(TVal.getNode(), DAG) ||
46768 !isFreeToSplitVector(FVal.getNode(), DAG))
46769 return SDValue();
46770
46771 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46772 ArrayRef<SDValue> Ops) {
46773 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46774 };
46775 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
46776 /*CheckBWI*/ false);
46777}
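
// Example of the narrowing above (sketch, assuming AVX1 where 256-bit integer
// blends are not natively available):
//   vselect Cond, (concat T0, T1), (concat F0, F1)
// becomes
//   concat (vselect Cond.lo, T0, F0), (vselect Cond.hi, T1, F1)
// so each half can be selected with a 128-bit operation.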
46778
46779static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
46780 const SDLoc &DL) {
46781 SDValue Cond = N->getOperand(0);
46782 SDValue LHS = N->getOperand(1);
46783 SDValue RHS = N->getOperand(2);
46784
46785 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46786 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46787 if (!TrueC || !FalseC)
46788 return SDValue();
46789
46790 // Don't do this for crazy integer types.
46791 EVT VT = N->getValueType(0);
46792 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46793 return SDValue();
46794
46795 // We're going to use the condition bit in math or logic ops. We could allow
46796 // this with a wider condition value (post-legalization it becomes an i8),
46797 // but if nothing is creating selects that late, it doesn't matter.
46798 if (Cond.getValueType() != MVT::i1)
46799 return SDValue();
46800
46801 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46802 // 3, 5, or 9 with i32/i64, so those get transformed too.
46803 // TODO: For constants that overflow or do not differ by power-of-2 or small
46804 // multiplier, convert to 'and' + 'add'.
46805 const APInt &TrueVal = TrueC->getAPIntValue();
46806 const APInt &FalseVal = FalseC->getAPIntValue();
46807
46808 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46809 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46810 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46811 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46812 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46813 return SDValue();
46814 }
46815
46816 bool OV;
46817 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46818 if (OV)
46819 return SDValue();
46820
46821 APInt AbsDiff = Diff.abs();
46822 if (AbsDiff.isPowerOf2() ||
46823 ((VT == MVT::i32 || VT == MVT::i64) &&
46824 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46825
46826 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46827 // of the condition can usually be folded into a compare predicate, but even
46828 // without that, the sequence should be cheaper than a CMOV alternative.
46829 if (TrueVal.slt(FalseVal)) {
46830 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46831 std::swap(TrueC, FalseC);
46832 }
46833
46834 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
46835 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46836
46837 // Multiply condition by the difference if non-one.
46838 if (!AbsDiff.isOne())
46839 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46840
46841 // Add the base if non-zero.
46842 if (!FalseC->isZero())
46843 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46844
46845 return R;
46846 }
46847
46848 return SDValue();
46849}
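
// Worked example for the constant-select fold above (sketch): for i32
//   select i1 %c, 7, 3
// the arm difference is 4 (a power of two), so this lowers roughly as
//   %z = zext i1 %c to i32
//   %r = add (mul %z, 4), 3        ; shift/LEA + add, no CMOV required
// When the true constant is the smaller one, the condition is inverted first
// so the multiplier stays positive.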
46850
46851/// If this is a *dynamic* select (non-constant condition) and we can match
46852/// this node with one of the variable blend instructions, restructure the
46853/// condition so that blends can use the high (sign) bit of each element.
46854/// This function will also call SimplifyDemandedBits on already created
46855/// BLENDV to perform additional simplifications.
46856static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46857 const SDLoc &DL,
46858 TargetLowering::DAGCombinerInfo &DCI,
46859 const X86Subtarget &Subtarget) {
46860 SDValue Cond = N->getOperand(0);
46861 if ((N->getOpcode() != ISD::VSELECT &&
46862 N->getOpcode() != X86ISD::BLENDV) ||
46863 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46864 return SDValue();
46865
46866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46867 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46868 EVT VT = N->getValueType(0);
46869
46870 // We can only handle the cases where VSELECT is directly legal on the
46871 // subtarget. We custom lower VSELECT nodes with constant conditions and
46872 // this makes it hard to see whether a dynamic VSELECT will correctly
46873 // lower, so we both check the operation's status and explicitly handle the
46874 // cases where a *dynamic* blend will fail even though a constant-condition
46875 // blend could be custom lowered.
46876 // FIXME: We should find a better way to handle this class of problems.
46877 // Potentially, we should combine constant-condition vselect nodes
46878 // pre-legalization into shuffles and not mark as many types as custom
46879 // lowered.
46880 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
46881 return SDValue();
46882 // FIXME: We don't support i16-element blends currently. We could and
46883 // should support them by making *all* the bits in the condition be set
46884 // rather than just the high bit and using an i8-element blend.
46885 if (VT.getVectorElementType() == MVT::i16)
46886 return SDValue();
46887 // Dynamic blending was only available from SSE4.1 onward.
46888 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46889 return SDValue();
46890 // Byte blends are only available in AVX2
46891 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46892 return SDValue();
46893 // There are no 512-bit blend instructions that use sign bits.
46894 if (VT.is512BitVector())
46895 return SDValue();
46896
46897 // Don't optimize before the condition has been transformed to a legal type
46898 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46899 if (BitWidth < 8 || BitWidth > 64)
46900 return SDValue();
46901
46902 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46903 for (SDUse &Use : Cond->uses())
46904 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
46905 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
46906 Use.getOperandNo() != 0)
46907 return false;
46908
46909 return true;
46910 };
46911
46912 APInt DemandedBits(APInt::getSignMask(BitWidth));
46913
46914 if (OnlyUsedAsSelectCond(Cond)) {
46915 KnownBits Known;
46916 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46917 !DCI.isBeforeLegalizeOps());
46918 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46919 return SDValue();
46920
46921 // If we changed the computation somewhere in the DAG, this change will
46922 // affect all users of Cond. Update all the nodes so that we do not use
46923 // the generic VSELECT anymore. Otherwise, we may perform wrong
46924 // optimizations as we messed with the actual expectation for the vector
46925 // boolean values.
46926 for (SDNode *U : Cond->users()) {
46927 if (U->getOpcode() == X86ISD::BLENDV)
46928 continue;
46929
46930 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46931 Cond, U->getOperand(1), U->getOperand(2));
46932 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46933 DCI.AddToWorklist(U);
46934 }
46935 DCI.CommitTargetLoweringOpt(TLO);
46936 return SDValue(N, 0);
46937 }
46938
46939 // Otherwise we can still at least try to simplify multiple use bits.
46940 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46941 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
46942 N->getOperand(1), N->getOperand(2));
46943
46944 return SDValue();
46945}
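
// Note on the transform above (descriptive summary, roughly): X86ISD::BLENDV
// only reads the sign bit of each condition element, so once every user of the
// condition is a select/BLENDV, SimplifyDemandedBits is free to discard all of
// the work that computed the condition lanes' low bits.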
46946
46947// Try to match:
46948// (or (and (M, (sub 0, X)), (pandn M, X)))
46949// which is a special case of:
46950// (select M, (sub 0, X), X)
46951// Per:
46952// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46953// We know that, if fNegate is 0 or 1:
46954// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46955//
46956// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46957// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46958// ( M ? -X : X) == ((X ^ M ) + (M & 1))
46959// This lets us transform our vselect to:
46960// (add (xor X, M), (and M, 1))
46961// And further to:
46962// (sub (xor X, M), M)
46963static SDValue combineLogicBlendIntoConditionalNegate(
46964 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46965 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46966 EVT MaskVT = Mask.getValueType();
46967 assert(MaskVT.isInteger() &&
46968 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46969 "Mask must be zero/all-bits");
46970
46971 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46972 return SDValue();
46973 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
46974 return SDValue();
46975
46976 auto IsNegV = [](SDNode *N, SDValue V) {
46977 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46978 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46979 };
46980
46981 SDValue V;
46982 if (IsNegV(Y.getNode(), X))
46983 V = X;
46984 else if (IsNegV(X.getNode(), Y))
46985 V = Y;
46986 else
46987 return SDValue();
46988
46989 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46990 SDValue SubOp2 = Mask;
46991
46992 // If the negate was on the false side of the select, then
46993 // the operands of the SUB need to be swapped. PR 27251.
46994 // This is because the pattern being matched above is
46995 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
46996 // but if the pattern matched was
46997 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
46998 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
46999 // pattern also needs to be a negation of the replacement pattern above.
47000 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47001 // sub accomplishes the negation of the replacement pattern.
47002 if (V == Y)
47003 std::swap(SubOp1, SubOp2);
47004
47005 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47006 return DAG.getBitcast(VT, Res);
47007}
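
// Worked example (sketch): with an all-ones/all-zeros lane mask M,
//   (vselect M, (sub 0, X), X)  -->  (sub (xor X, M), M)
// e.g. a lane with M = -1 gives (~X) - (-1) = ~X + 1 = -X, and a lane with
// M = 0 gives (X ^ 0) - 0 = X, matching the bit-hack reference above.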
47008
47009static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47010 const X86Subtarget &Subtarget) {
47011 if (!Subtarget.hasAVX512())
47012 return SDValue();
47013 if (N->getOpcode() != ISD::VSELECT)
47014 return SDValue();
47015
47016 SDValue Cond = N->getOperand(0);
47017 SDValue LHS = N->getOperand(1);
47018 SDValue RHS = N->getOperand(2);
47019
47020 if (canCombineAsMaskOperation(LHS, Subtarget))
47021 return SDValue();
47022
47023 if (!canCombineAsMaskOperation(RHS, Subtarget))
47024 return SDValue();
47025
47026 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
47027 return SDValue();
47028
47029 // Commute LHS and RHS to create opportunity to select mask instruction.
47030 // (vselect M, L, R) -> (vselect ~M, R, L)
47031 ISD::CondCode NewCC =
47032 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
47033 Cond.getOperand(0).getValueType());
47034 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
47035 Cond.getOperand(1), NewCC);
47036 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47037}
47038
47039/// Do target-specific dag combines on SELECT and VSELECT nodes.
47040static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47041 TargetLowering::DAGCombinerInfo &DCI,
47042 const X86Subtarget &Subtarget) {
47043 SDLoc DL(N);
47044 SDValue Cond = N->getOperand(0);
47045 SDValue LHS = N->getOperand(1);
47046 SDValue RHS = N->getOperand(2);
47047
47048 // Try simplification again because we use this function to optimize
47049 // BLENDV nodes that are not handled by the generic combiner.
47050 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47051 return V;
47052
47053 // When avx512 is available the lhs operand of select instruction can be
47054 // folded with mask instruction, while the rhs operand can't. Commute the
47055 // lhs and rhs of the select instruction to create the opportunity of
47056 // folding.
47057 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47058 return V;
47059
47060 EVT VT = LHS.getValueType();
47061 EVT CondVT = Cond.getValueType();
47062 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47063 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47064
47065 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47066 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47067 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47068 if (CondVT.isVector() && CondVT.isInteger() &&
47069 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47070 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47071 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47072 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47073 DL, DAG, Subtarget))
47074 return V;
47075
47076 // Convert vselects with constant condition into shuffles.
47077 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
47078 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
47079 SmallVector<int, 64> Mask;
47080 if (createShuffleMaskFromVSELECT(Mask, Cond,
47081 N->getOpcode() == X86ISD::BLENDV))
47082 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
47083 }
47084
47085 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47086 // by forcing the unselected elements to zero.
47087 // TODO: Can we handle more shuffles with this?
47088 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
47089 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
47090 LHS.hasOneUse() && RHS.hasOneUse()) {
47091 MVT SimpleVT = VT.getSimpleVT();
47092 SmallVector<SDValue, 1> LHSOps, RHSOps;
47093 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
47094 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
47095 getTargetShuffleMask(LHS, true, LHSOps, LHSMask) &&
47096 getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) {
47097 int NumElts = VT.getVectorNumElements();
47098 for (int i = 0; i != NumElts; ++i) {
47099 // getConstVector sets negative shuffle mask values as undef, so ensure
47100 // we hardcode SM_SentinelZero values to zero (0x80).
47101 if (CondMask[i] < NumElts) {
47102 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
47103 RHSMask[i] = 0x80;
47104 } else {
47105 LHSMask[i] = 0x80;
47106 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
47107 }
47108 }
47109 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
47110 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
47111 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
47112 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
47113 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
47114 }
47115 }
47116
47117 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47118 // instructions match the semantics of the common C idiom x<y?x:y but not
47119 // x<=y?x:y, because of how they handle negative zero (which can be
47120 // ignored in unsafe-math mode).
47121 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
47122 if ((Cond.getOpcode() == ISD::SETCC ||
47123 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47124 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47125 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47126 (Subtarget.hasSSE2() ||
47127 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47128 bool IsStrict = Cond->isStrictFPOpcode();
47129 ISD::CondCode CC =
47130 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47131 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47132 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47133
47134 unsigned Opcode = 0;
47135 // Check for x CC y ? x : y.
47136 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47137 switch (CC) {
47138 default: break;
47139 case ISD::SETULT:
47140 // Converting this to a min would handle NaNs incorrectly, and swapping
47141 // the operands would cause it to handle comparisons between positive
47142 // and negative zero incorrectly.
47143 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47144 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47145 !(DAG.isKnownNeverZeroFloat(LHS) ||
47146 DAG.isKnownNeverZeroFloat(RHS)))
47147 break;
47148 std::swap(LHS, RHS);
47149 }
47150 Opcode = X86ISD::FMIN;
47151 break;
47152 case ISD::SETOLE:
47153 // Converting this to a min would handle comparisons between positive
47154 // and negative zero incorrectly.
47155 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47156 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47157 break;
47158 Opcode = X86ISD::FMIN;
47159 break;
47160 case ISD::SETULE:
47161 // Converting this to a min would handle both negative zeros and NaNs
47162 // incorrectly, but we can swap the operands to fix both.
47163 std::swap(LHS, RHS);
47164 [[fallthrough]];
47165 case ISD::SETOLT:
47166 case ISD::SETLT:
47167 case ISD::SETLE:
47168 Opcode = X86ISD::FMIN;
47169 break;
47170
47171 case ISD::SETOGE:
47172 // Converting this to a max would handle comparisons between positive
47173 // and negative zero incorrectly.
47174 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47175 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47176 break;
47177 Opcode = X86ISD::FMAX;
47178 break;
47179 case ISD::SETUGT:
47180 // Converting this to a max would handle NaNs incorrectly, and swapping
47181 // the operands would cause it to handle comparisons between positive
47182 // and negative zero incorrectly.
47183 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47184 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47185 !(DAG.isKnownNeverZeroFloat(LHS) ||
47186 DAG.isKnownNeverZeroFloat(RHS)))
47187 break;
47188 std::swap(LHS, RHS);
47189 }
47190 Opcode = X86ISD::FMAX;
47191 break;
47192 case ISD::SETUGE:
47193 // Converting this to a max would handle both negative zeros and NaNs
47194 // incorrectly, but we can swap the operands to fix both.
47195 std::swap(LHS, RHS);
47196 [[fallthrough]];
47197 case ISD::SETOGT:
47198 case ISD::SETGT:
47199 case ISD::SETGE:
47200 Opcode = X86ISD::FMAX;
47201 break;
47202 }
47203 // Check for x CC y ? y : x -- a min/max with reversed arms.
47204 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47205 switch (CC) {
47206 default: break;
47207 case ISD::SETOGE:
47208 // Converting this to a min would handle comparisons between positive
47209 // and negative zero incorrectly, and swapping the operands would
47210 // cause it to handle NaNs incorrectly.
47211 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47212 !(DAG.isKnownNeverZeroFloat(LHS) ||
47213 DAG.isKnownNeverZeroFloat(RHS))) {
47214 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47215 break;
47216 std::swap(LHS, RHS);
47217 }
47218 Opcode = X86ISD::FMIN;
47219 break;
47220 case ISD::SETUGT:
47221 // Converting this to a min would handle NaNs incorrectly.
47222 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47223 break;
47224 Opcode = X86ISD::FMIN;
47225 break;
47226 case ISD::SETUGE:
47227 // Converting this to a min would handle both negative zeros and NaNs
47228 // incorrectly, but we can swap the operands to fix both.
47229 std::swap(LHS, RHS);
47230 [[fallthrough]];
47231 case ISD::SETOGT:
47232 case ISD::SETGT:
47233 case ISD::SETGE:
47234 Opcode = X86ISD::FMIN;
47235 break;
47236
47237 case ISD::SETULT:
47238 // Converting this to a max would handle NaNs incorrectly.
47239 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47240 break;
47241 Opcode = X86ISD::FMAX;
47242 break;
47243 case ISD::SETOLE:
47244 // Converting this to a max would handle comparisons between positive
47245 // and negative zero incorrectly, and swapping the operands would
47246 // cause it to handle NaNs incorrectly.
47247 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47248 !DAG.isKnownNeverZeroFloat(LHS) &&
47249 !DAG.isKnownNeverZeroFloat(RHS)) {
47250 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47251 break;
47252 std::swap(LHS, RHS);
47253 }
47254 Opcode = X86ISD::FMAX;
47255 break;
47256 case ISD::SETULE:
47257 // Converting this to a max would handle both negative zeros and NaNs
47258 // incorrectly, but we can swap the operands to fix both.
47259 std::swap(LHS, RHS);
47260 [[fallthrough]];
47261 case ISD::SETOLT:
47262 case ISD::SETLT:
47263 case ISD::SETLE:
47264 Opcode = X86ISD::FMAX;
47265 break;
47266 }
47267 }
47268
47269 if (Opcode) {
47270 if (IsStrict) {
47271 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47272 : X86ISD::STRICT_FMAX,
47273 DL, {N->getValueType(0), MVT::Other},
47274 {Cond.getOperand(0), LHS, RHS});
47275 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47276 return Ret;
47277 }
47278 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47279 }
47280 }
47281
47282 // Some mask scalar intrinsics rely on checking if only one bit is set
47283 // and implement it in C code like this:
47284 // A[0] = (U & 1) ? A[0] : W[0];
47285 // This creates some redundant instructions that break pattern matching.
47286 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
47287 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47288 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47289 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47290 SDValue AndNode = Cond.getOperand(0);
47291 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47292 isNullConstant(Cond.getOperand(1)) &&
47293 isOneConstant(AndNode.getOperand(1))) {
47294 // LHS and RHS swapped due to
47295 // setcc outputting 1 when AND resulted in 0 and vice versa.
47296 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47297 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47298 }
47299 }
47300
47301 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47302 // lowering on KNL. In this case we convert it to
47303 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47304 // The same situation holds for all vectors of i8 and i16 without BWI.
47305 // Make sure we extend these even before type legalization gets a chance to
47306 // split wide vectors.
47307 // Since SKX these selects have a proper lowering.
47308 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47309 CondVT.getVectorElementType() == MVT::i1 &&
47310 (VT.getVectorElementType() == MVT::i8 ||
47311 VT.getVectorElementType() == MVT::i16)) {
47312 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47313 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47314 }
47315
47316 // AVX512 - Extend select to merge with target shuffle.
47317 // select(mask, extract_subvector(shuffle(x)), y) -->
47318 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47319 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47320 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47321 CondVT.getVectorElementType() == MVT::i1) {
47322 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47323 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47324 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47325 isNullConstant(Op.getOperand(1)) &&
47326 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47327 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47328 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47329 ISD::isBuildVectorAllZeros(Alt.getNode()));
47330 };
47331
47332 bool SelectableLHS = SelectableOp(LHS, RHS);
47333 bool SelectableRHS = SelectableOp(RHS, LHS);
47334 if (SelectableLHS || SelectableRHS) {
47335 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
47336 : RHS.getOperand(0).getValueType();
47337 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
47338 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
47339 VT.getSizeInBits());
47340 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
47341 VT.getSizeInBits());
47342 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
47343 DAG.getUNDEF(SrcCondVT), Cond,
47344 DAG.getVectorIdxConstant(0, DL));
47345 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
47346 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
47347 }
47348 }
47349
47350 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
47351 return V;
47352
47353 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47354 Cond.hasOneUse()) {
47355 EVT CondVT = Cond.getValueType();
47356 SDValue Cond0 = Cond.getOperand(0);
47357 SDValue Cond1 = Cond.getOperand(1);
47358 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47359
47360 // Canonicalize min/max:
47361 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47362 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47363 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
47364 // the need for an extra compare against zero. e.g.
47365 // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
47366 // subl %esi, %edi
47367 // testl %edi, %edi
47368 // movl $0, %eax
47369 // cmovgl %edi, %eax
47370 // =>
47371 // xorl %eax, %eax
47372 // subl %esi, %edi
47373 // cmovsl %eax, %edi
47374 //
47375 // We can also canonicalize
47376 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
47377 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
47378 // This allows the use of a test instruction for the compare.
47379 if (LHS == Cond0 && RHS == Cond1) {
47380 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
47381 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
47382 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
47383 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47384 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47385 }
47386 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
47387 ISD::CondCode NewCC = ISD::SETUGE;
47388 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47389 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47390 }
47391 }
47392
47393 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
47394 // fold eq + gt/lt nested selects into ge/le selects
47395 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
47396 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47397 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
47398 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47399 // .. etc ..
47400 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
47401 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
47402 SDValue InnerSetCC = RHS.getOperand(0);
47403 ISD::CondCode InnerCC =
47404 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
47405 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
47406 Cond0 == InnerSetCC.getOperand(0) &&
47407 Cond1 == InnerSetCC.getOperand(1)) {
47408 ISD::CondCode NewCC;
47409 switch (CC == ISD::SETEQ ? InnerCC : CC) {
47410 // clang-format off
47411 case ISD::SETGT: NewCC = ISD::SETGE; break;
47412 case ISD::SETLT: NewCC = ISD::SETLE; break;
47413 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
47414 case ISD::SETULT: NewCC = ISD::SETULE; break;
47415 default: NewCC = ISD::SETCC_INVALID; break;
47416 // clang-format on
47417 }
47418 if (NewCC != ISD::SETCC_INVALID) {
47419 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
47420 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
47421 }
47422 }
47423 }
47424 }
47425
47426 // Check if the first operand is all zeros and Cond type is vXi1.
47427 // If this an avx512 target we can improve the use of zero masking by
47428 // swapping the operands and inverting the condition.
47429 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
47430 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
47431 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
47432 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
47433 // Invert the cond to not(cond) : xor(op,allones)=not(op)
47434 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
47435 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
47436 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
47437 }
47438
47439 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
47440 // get split by legalization.
47441 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
47442 CondVT.getVectorElementType() == MVT::i1 &&
47443 TLI.isTypeLegal(VT.getScalarType())) {
47444 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
47445 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
47446 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
47447 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
47448 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
47449 }
47450 }
47451
47452 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
47453 // with out-of-bounds clamping.
47454
47455 // Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle
47456 // shift amounts exceeding the element bitwidth. For any per-element amount
47457 // greater than bitwidth-1, VSHLV/VSRLV write zero to that element instead of
47458 // producing an undefined result, for both left and unsigned right shifts,
47459 // which is exactly the zero operand of the selects matched below.
47460 if (N->getOpcode() == ISD::VSELECT) {
47461 using namespace llvm::SDPatternMatch;
47462 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
47463 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
47464 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
47465 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
47466 ISD::isBuildVectorAllZeros(RHS.getNode()) &&
47467 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
47468 m_SpecificInt(VT.getScalarSizeInBits()),
47469 m_SpecificCondCode(ISD::SETULT)))) {
47470 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
47471 : X86ISD::VSHLV,
47472 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
47473 }
47474 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
47475 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
47476 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
47477 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
47478 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
47479 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
47480 m_SpecificInt(VT.getScalarSizeInBits()),
47481 m_SpecificCondCode(ISD::SETUGE)))) {
47482 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
47483 : X86ISD::VSHLV,
47484 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
47485 }
47486 }
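
  // Illustrative source pattern for the VSHLV/VSRLV folds above (a sketch,
  // assuming AVX2 and v8i32):
  //   out[i] = amt[i] < 32 ? x[i] << amt[i] : 0;
  // matches select(icmp_ult(amt,32), shl(x,amt), 0) and is emitted directly
  // as VPSLLVD, which already produces 0 for any shift amount >= 32.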
47487
47488 // Early exit check
47489 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
47490 return SDValue();
47491
47492 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DL, DCI, Subtarget))
47493 return V;
47494
47495 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
47496 return V;
47497
47498 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
47499 return V;
47500
47501 // select(~Cond, X, Y) -> select(Cond, Y, X)
47502 if (CondVT.getScalarType() != MVT::i1) {
47503 if (SDValue CondNot = IsNOT(Cond, DAG))
47504 return DAG.getNode(N->getOpcode(), DL, VT,
47505 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
47506
47507 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
47508 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
47509 Cond.getOperand(0).getOpcode() == ISD::AND &&
47510 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
47511 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
47512 Cond.getScalarValueSizeInBits(),
47513 /*AllowUndefs=*/true) &&
47514 Cond.hasOneUse()) {
47515 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
47516 Cond.getOperand(0).getOperand(1));
47517 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47518 }
47519
47520 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
47521 // signbit.
47522 if (Cond.getOpcode() == X86ISD::PCMPGT &&
47523 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
47524 Cond.hasOneUse()) {
47525 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
47526 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
47527 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47528 }
47529 }
47530
47531 // Try to optimize vXi1 selects if both operands are either all constants or
47532 // bitcasts from scalar integer type. In that case we can convert the operands
47533 // to integer and use an integer select which will be converted to a CMOV.
47534 // We need to take a little bit of care to avoid creating an i64 type after
47535 // type legalization.
47536 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
47537 VT.getVectorElementType() == MVT::i1 &&
47538 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
47539 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
47540 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
47541 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
47542 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
47543
47544 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
47545 LHS.getOperand(0).getValueType() == IntVT)) &&
47546 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
47547 RHS.getOperand(0).getValueType() == IntVT))) {
47548 if (LHSIsConst)
47549 LHS = combinevXi1ConstantToInteger(LHS, DAG);
47550 else
47551 LHS = LHS.getOperand(0);
47552
47553 if (RHSIsConst)
47554 RHS = combinevXi1ConstantToInteger(RHS, DAG);
47555 else
47556 RHS = RHS.getOperand(0);
47557
47558 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
47559 return DAG.getBitcast(VT, Select);
47560 }
47561 }
47562 }
47563
47564 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
47565 // single bits, then invert the predicate and swap the select operands.
47566 // This can lower using a vector shift bit-hack rather than mask and compare.
47567 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
47568 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
47569 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
47570 Cond.getOperand(0).getOpcode() == ISD::AND &&
47571 isNullOrNullSplat(Cond.getOperand(1)) &&
47572 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
47573 Cond.getOperand(0).getValueType() == VT) {
47574 // The 'and' mask must be composed of power-of-2 constants.
47575 SDValue And = Cond.getOperand(0);
47576 auto *C = isConstOrConstSplat(And.getOperand(1));
47577 if (C && C->getAPIntValue().isPowerOf2()) {
47578 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
47579 SDValue NotCond =
47580 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
47581 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
47582 }
47583
47584 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
47585 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
47586 // 16-bit lacks a proper blendv.
47587 unsigned EltBitWidth = VT.getScalarSizeInBits();
47588 bool CanShiftBlend =
47589 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
47590 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
47591 (Subtarget.hasXOP()));
47592 if (CanShiftBlend &&
47593 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
47594 return C->getAPIntValue().isPowerOf2();
47595 })) {
47596 // Create a left-shift constant to get the mask bits over to the sign-bit.
47597 SDValue Mask = And.getOperand(1);
47598 SmallVector<int, 32> ShlVals;
47599 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
47600 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
47601 ShlVals.push_back(EltBitWidth - 1 -
47602 MaskVal->getAPIntValue().exactLogBase2());
47603 }
47604 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
47605 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
47606 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
47607 SDValue NewCond =
47608 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
47609 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
47610 }
47611 }
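
  // Worked example for the shift bit-hack above (sketch, assuming AVX and
  // v4i32): for
  //   vselect ((X & <1,2,4,8>) == 0), LHS, RHS
  // the mask bits are moved into the sign bit with Shl = shl X, <31,30,29,28>
  // and the select becomes (vselect (Shl < 0), RHS, LHS), which a sign-bit
  // blend (BLENDVPS) can implement without the compare against zero.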
47612
47613 return SDValue();
47614}
47615
47616/// Combine:
47617/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
47618/// to:
47619/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
47620/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
47621/// Note that this is only legal for some op/cc combinations.
47622static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
47623 SelectionDAG &DAG,
47624 const X86Subtarget &Subtarget) {
47625 // This combine only operates on CMP-like nodes.
47626 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47627 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47628 return SDValue();
47629
47630 // Can't replace the cmp if it has more uses than the one we're looking at.
47631 // FIXME: We would like to be able to handle this, but would need to make sure
47632 // all uses were updated.
47633 if (!Cmp.hasOneUse())
47634 return SDValue();
47635
47636 // This only applies to variations of the common case:
47637 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
47638 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
47639 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
47640 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
47641 // Using the proper condcodes (see below), overflow is checked for.
47642
47643 // FIXME: We can generalize both constraints:
47644 // - XOR/OR/AND (if they were made to survive AtomicExpand)
47645 // - LHS != 1
47646 // if the result is compared.
47647
47648 SDValue CmpLHS = Cmp.getOperand(0);
47649 SDValue CmpRHS = Cmp.getOperand(1);
47650 EVT CmpVT = CmpLHS.getValueType();
47651
47652 if (!CmpLHS.hasOneUse())
47653 return SDValue();
47654
47655 unsigned Opc = CmpLHS.getOpcode();
47656 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
47657 return SDValue();
47658
47659 SDValue OpRHS = CmpLHS.getOperand(2);
47660 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
47661 if (!OpRHSC)
47662 return SDValue();
47663
47664 APInt Addend = OpRHSC->getAPIntValue();
47665 if (Opc == ISD::ATOMIC_LOAD_SUB)
47666 Addend = -Addend;
47667
47668 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
47669 if (!CmpRHSC)
47670 return SDValue();
47671
47672 APInt Comparison = CmpRHSC->getAPIntValue();
47673 APInt NegAddend = -Addend;
47674
47675 // See if we can adjust the CC to make the comparison match the negated
47676 // addend.
47677 if (Comparison != NegAddend) {
47678 APInt IncComparison = Comparison + 1;
47679 if (IncComparison == NegAddend) {
47680 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
47681 Comparison = IncComparison;
47682 CC = X86::COND_AE;
47683 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
47684 Comparison = IncComparison;
47685 CC = X86::COND_L;
47686 }
47687 }
47688 APInt DecComparison = Comparison - 1;
47689 if (DecComparison == NegAddend) {
47690 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
47691 Comparison = DecComparison;
47692 CC = X86::COND_A;
47693 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
47694 Comparison = DecComparison;
47695 CC = X86::COND_LE;
47696 }
47697 }
47698 }
47699
47700 // If the addend is the negation of the comparison value, then we can do
47701 // a full comparison by emitting the atomic arithmetic as a locked sub.
47702 if (Comparison == NegAddend) {
47703 // The CC is fine, but we need to rewrite the LHS of the comparison as an
47704 // atomic sub.
47705 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
47706 auto AtomicSub = DAG.getAtomic(
47707 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
47708 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
47709 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
47710 AN->getMemOperand());
47711 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47712 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47713 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47714 return LockOp;
47715 }
47716
47717 // We can handle comparisons with zero in a number of cases by manipulating
47718 // the CC used.
47719 if (!Comparison.isZero())
47720 return SDValue();
47721
47722 if (CC == X86::COND_S && Addend == 1)
47723 CC = X86::COND_LE;
47724 else if (CC == X86::COND_NS && Addend == 1)
47725 CC = X86::COND_G;
47726 else if (CC == X86::COND_G && Addend == -1)
47727 CC = X86::COND_GE;
47728 else if (CC == X86::COND_LE && Addend == -1)
47729 CC = X86::COND_L;
47730 else
47731 return SDValue();
47732
47733 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47734 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47735 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47736 return LockOp;
47737}
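
// Illustrative source pattern for the combine above (sketch):
//   if (__atomic_fetch_add(&v, 1, __ATOMIC_SEQ_CST) < 0) ...
// The compare of the fetched value against zero is folded away and the flags
// set by the LOCK ADD itself are reused, with COND_S rewritten to COND_LE on
// the post-increment value (overflow is accounted for by the condcode choice).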
47738
47739// Check whether we're just testing the signbit, and whether we can simplify
47740// this by tracking where the signbit came from.
47741static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
47742 SelectionDAG &DAG) {
47743 if (CC != X86::COND_S && CC != X86::COND_NS)
47744 return SDValue();
47745
47746 if (!Cmp.hasOneUse())
47747 return SDValue();
47748
47749 SDValue Src;
47750 if (Cmp.getOpcode() == X86ISD::CMP) {
47751 // CMP(X,0) -> signbit test
47752 if (!isNullConstant(Cmp.getOperand(1)))
47753 return SDValue();
47754 Src = Cmp.getOperand(0);
47755 // Peek through a SRA node as we just need the signbit.
47756 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
47757 // TODO: Use SimplifyDemandedBits instead of just SRA?
47758 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
47759 return SDValue();
47760 Src = Src.getOperand(0);
47761 } else if (Cmp.getOpcode() == X86ISD::OR) {
47762 // OR(X,Y) -> see if only one operand contributes to the signbit.
47763 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
47764 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
47765 Src = Cmp.getOperand(1);
47766 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
47767 Src = Cmp.getOperand(0);
47768 else
47769 return SDValue();
47770 } else {
47771 return SDValue();
47772 }
47773
47774 // Replace with a TEST on the MSB.
47775 SDLoc DL(Cmp);
47776 MVT SrcVT = Src.getSimpleValueType();
47777 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
47778
47779 // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
47780 // peek through and adjust the TEST bit.
47781 if (Src.getOpcode() == ISD::SHL) {
47782 if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) {
47783 Src = Src.getOperand(0);
47784 BitMask.lshrInPlace(*ShiftAmt);
47785 }
47786 }
47787
47788 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
47789 DAG.getConstant(BitMask, DL, SrcVT));
47790 CC = (CC == X86::COND_S ? X86::COND_NE : X86::COND_E);
47791 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
47792 DAG.getConstant(0, DL, SrcVT));
47793}
47794
47795// Check whether a boolean test is testing a boolean value generated by
47796// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47797// code.
47798//
47799// Simplify the following patterns:
47800// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47801// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47802// to (Op EFLAGS Cond)
47803//
47804// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47805// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47806// to (Op EFLAGS !Cond)
47807//
47808// where Op could be BRCOND or CMOV.
47809//
47810static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47811 // This combine only operates on CMP-like nodes.
47812 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47813 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47814 return SDValue();
47815
47816 // Quit if not used as a boolean value.
47817 if (CC != X86::COND_E && CC != X86::COND_NE)
47818 return SDValue();
47819
47820 // Check CMP operands. One of them should be 0 or 1 and the other should be
47821 // an SetCC or extended from it.
47822 SDValue Op1 = Cmp.getOperand(0);
47823 SDValue Op2 = Cmp.getOperand(1);
47824
47825 SDValue SetCC;
47826 const ConstantSDNode* C = nullptr;
47827 bool needOppositeCond = (CC == X86::COND_E);
47828 bool checkAgainstTrue = false; // Is it a comparison against 1?
47829
47830 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47831 SetCC = Op2;
47832 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47833 SetCC = Op1;
47834 else // Quit if all operands are not constants.
47835 return SDValue();
47836
47837 if (C->getZExtValue() == 1) {
47838 needOppositeCond = !needOppositeCond;
47839 checkAgainstTrue = true;
47840 } else if (C->getZExtValue() != 0)
47841 // Quit if the constant is neither 0 or 1.
47842 return SDValue();
47843
47844 bool truncatedToBoolWithAnd = false;
47845 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47846 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47847 SetCC.getOpcode() == ISD::TRUNCATE ||
47848 SetCC.getOpcode() == ISD::AND) {
47849 if (SetCC.getOpcode() == ISD::AND) {
47850 int OpIdx = -1;
47851 if (isOneConstant(SetCC.getOperand(0)))
47852 OpIdx = 1;
47853 if (isOneConstant(SetCC.getOperand(1)))
47854 OpIdx = 0;
47855 if (OpIdx < 0)
47856 break;
47857 SetCC = SetCC.getOperand(OpIdx);
47858 truncatedToBoolWithAnd = true;
47859 } else
47860 SetCC = SetCC.getOperand(0);
47861 }
47862
47863 switch (SetCC.getOpcode()) {
47864 case X86ISD::SETCC_CARRY:
47865 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47866 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47867 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47868 // truncated to i1 using 'and'.
47869 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47870 break;
47871 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47872 "Invalid use of SETCC_CARRY!");
47873 [[fallthrough]];
47874 case X86ISD::SETCC:
47875 // Set the condition code or opposite one if necessary.
47876 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47877 if (needOppositeCond)
47878 CC = X86::GetOppositeBranchCondition(CC);
47879 return SetCC.getOperand(1);
47880 case X86ISD::CMOV: {
47881 // Check whether false/true value has canonical one, i.e. 0 or 1.
47882 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47883 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47884 // Quit if true value is not a constant.
47885 if (!TVal)
47886 return SDValue();
47887 // Quit if false value is not a constant.
47888 if (!FVal) {
47889 SDValue Op = SetCC.getOperand(0);
47890 // Skip 'zext' or 'trunc' node.
47891 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47892 Op.getOpcode() == ISD::TRUNCATE)
47893 Op = Op.getOperand(0);
47894 // A special case for rdrand/rdseed, where 0 is set if false cond is
47895 // found.
47896 if ((Op.getOpcode() != X86ISD::RDRAND &&
47897 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47898 return SDValue();
47899 }
47900 // Quit if false value is not the constant 0 or 1.
47901 bool FValIsFalse = true;
47902 if (FVal && FVal->getZExtValue() != 0) {
47903 if (FVal->getZExtValue() != 1)
47904 return SDValue();
47905 // If FVal is 1, opposite cond is needed.
47906 needOppositeCond = !needOppositeCond;
47907 FValIsFalse = false;
47908 }
47909 // Quit if TVal is not the constant opposite of FVal.
47910 if (FValIsFalse && TVal->getZExtValue() != 1)
47911 return SDValue();
47912 if (!FValIsFalse && TVal->getZExtValue() != 0)
47913 return SDValue();
47914 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47915 if (needOppositeCond)
47916 CC = X86::GetOppositeBranchCondition(CC);
47917 return SetCC.getOperand(3);
47918 }
47919 }
47920
47921 return SDValue();
47922}
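
// Worked example (sketch): for
//   (brcond (cmp (setcc COND_B, EFLAGS), 0), COND_NE)
// the materialized boolean is bypassed and the branch consumes EFLAGS with
// COND_B directly; comparing against 1 (or against 0 with COND_E) selects the
// opposite condition instead, per the patterns listed in the comment above.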
47923
47924/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47925/// Match:
47926/// (X86or (X86setcc) (X86setcc))
47927/// (X86cmp (and (X86setcc) (X86setcc)), 0)
47928static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47929 X86::CondCode &CC1, SDValue &Flags,
47930 bool &isAnd) {
47931 if (Cond->getOpcode() == X86ISD::CMP) {
47932 if (!isNullConstant(Cond->getOperand(1)))
47933 return false;
47934
47935 Cond = Cond->getOperand(0);
47936 }
47937
47938 isAnd = false;
47939
47940 SDValue SetCC0, SetCC1;
47941 switch (Cond->getOpcode()) {
47942 default: return false;
47943 case ISD::AND:
47944 case X86ISD::AND:
47945 isAnd = true;
47946 [[fallthrough]];
47947 case ISD::OR:
47948 case X86ISD::OR:
47949 SetCC0 = Cond->getOperand(0);
47950 SetCC1 = Cond->getOperand(1);
47951 break;
47952 };
47953
47954 // Make sure we have SETCC nodes, using the same flags value.
47955 if (SetCC0.getOpcode() != X86ISD::SETCC ||
47956 SetCC1.getOpcode() != X86ISD::SETCC ||
47957 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47958 return false;
47959
47960 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47961 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47962 Flags = SetCC0->getOperand(1);
47963 return true;
47964}
47965
47966// When legalizing carry, we create carries via add X, -1
47967// If that comes from an actual carry, via setcc, we use the
47968// carry directly.
47969static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47970 if (EFLAGS.getOpcode() == X86ISD::ADD) {
47971 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
47972 bool FoundAndLSB = false;
47973 SDValue Carry = EFLAGS.getOperand(0);
47974 while (Carry.getOpcode() == ISD::TRUNCATE ||
47975 Carry.getOpcode() == ISD::ZERO_EXTEND ||
47976 (Carry.getOpcode() == ISD::AND &&
47977 isOneConstant(Carry.getOperand(1)))) {
47978 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
47979 Carry = Carry.getOperand(0);
47980 }
47981 if (Carry.getOpcode() == X86ISD::SETCC ||
47982 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
47983 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47984 uint64_t CarryCC = Carry.getConstantOperandVal(0);
47985 SDValue CarryOp1 = Carry.getOperand(1);
47986 if (CarryCC == X86::COND_B)
47987 return CarryOp1;
47988 if (CarryCC == X86::COND_A) {
47989 // Try to convert COND_A into COND_B in an attempt to facilitate
47990 // materializing "setb reg".
47991 //
47992 // Do not flip "e > c", where "c" is a constant, because Cmp
47993 // instruction cannot take an immediate as its first operand.
47994 //
47995 if (CarryOp1.getOpcode() == X86ISD::SUB &&
47996 CarryOp1.getNode()->hasOneUse() &&
47997 CarryOp1.getValueType().isInteger() &&
47998 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
47999 SDValue SubCommute =
48000 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48001 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48002 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48003 }
48004 }
48005 // If this is a check of the z flag of an add with 1, switch to the
48006 // C flag.
48007 if (CarryCC == X86::COND_E &&
48008 CarryOp1.getOpcode() == X86ISD::ADD &&
48009 isOneConstant(CarryOp1.getOperand(1)))
48010 return CarryOp1;
48011 } else if (FoundAndLSB) {
48012 SDLoc DL(Carry);
48013 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48014 if (Carry.getOpcode() == ISD::SRL) {
48015 BitNo = Carry.getOperand(1);
48016 Carry = Carry.getOperand(0);
48017 }
48018 return getBT(Carry, BitNo, DL, DAG);
48019 }
48020 }
48021 }
48022
48023 return SDValue();
48024}
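
// Worked example (sketch): when carry legalization produces
//   EFLAGS = (X86ISD::ADD (zext (setcc COND_B, Flags)), -1)
// the add is bypassed and Flags is used as the carry source directly; a
// COND_A compare is commuted into COND_B where legal so that a plain SETB/SBB
// can consume it.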
48025
48026/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
48027/// to avoid the inversion.
48028static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48029 SelectionDAG &DAG,
48030 const X86Subtarget &Subtarget) {
48031 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48032 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48033 EFLAGS.getOpcode() != X86ISD::TESTP)
48034 return SDValue();
48035
48036 // PTEST/TESTP sets EFLAGS as:
48037 // TESTZ: ZF = (Op0 & Op1) == 0
48038 // TESTC: CF = (~Op0 & Op1) == 0
48039 // TESTNZC: ZF == 0 && CF == 0
48040 MVT VT = EFLAGS.getSimpleValueType();
48041 SDValue Op0 = EFLAGS.getOperand(0);
48042 SDValue Op1 = EFLAGS.getOperand(1);
48043 MVT OpVT = Op0.getSimpleValueType();
48044 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48045
48046 // TEST*(~X,Y) == TEST*(X,Y)
48047 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48048 X86::CondCode InvCC;
48049 switch (CC) {
48050 case X86::COND_B:
48051 // testc -> testz.
48052 InvCC = X86::COND_E;
48053 break;
48054 case X86::COND_AE:
48055 // !testc -> !testz.
48056 InvCC = X86::COND_NE;
48057 break;
48058 case X86::COND_E:
48059 // testz -> testc.
48060 InvCC = X86::COND_B;
48061 break;
48062 case X86::COND_NE:
48063 // !testz -> !testc.
48064 InvCC = X86::COND_AE;
48065 break;
48066 case X86::COND_A:
48067 case X86::COND_BE:
48068 // testnzc -> testnzc (no change).
48069 InvCC = CC;
48070 break;
48071 default:
48072 InvCC = X86::COND_INVALID;
48073 break;
48074 }
48075
48076 if (InvCC != X86::COND_INVALID) {
48077 CC = InvCC;
48078 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48079 DAG.getBitcast(OpVT, NotOp0), Op1);
48080 }
48081 }
48082
48083 if (CC == X86::COND_B || CC == X86::COND_AE) {
48084 // TESTC(X,~X) == TESTC(X,-1)
48085 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48086 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48087 SDLoc DL(EFLAGS);
48088 return DAG.getNode(
48089 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48090 DAG.getBitcast(OpVT,
48091 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48092 }
48093 }
48094 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48095 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48096 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48097 SDValue BC0 = peekThroughBitcasts(Op0);
48098 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48100 SDLoc DL(EFLAGS);
48102 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48103 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48104 }
48105 }
48106 }
48107
48108 if (CC == X86::COND_E || CC == X86::COND_NE) {
48109 // TESTZ(X,~Y) == TESTC(Y,X)
48110 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48111 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48112 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48113 DAG.getBitcast(OpVT, NotOp1), Op0);
48114 }
48115
48116 if (Op0 == Op1) {
48117 SDValue BC = peekThroughBitcasts(Op0);
48118 EVT BCVT = BC.getValueType();
48119
48120 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48121 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48122 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48123 DAG.getBitcast(OpVT, BC.getOperand(0)),
48124 DAG.getBitcast(OpVT, BC.getOperand(1)));
48125 }
48126
48127 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48128 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48129 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48130 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48131 DAG.getBitcast(OpVT, BC.getOperand(0)),
48132 DAG.getBitcast(OpVT, BC.getOperand(1)));
48133 }
48134
48135 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48136 // to more efficiently extract the sign bits and compare that.
48137 // TODO: Handle TESTC with comparison inversion.
48138 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48139 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
48140 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48141 unsigned EltBits = BCVT.getScalarSizeInBits();
48142 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48143 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48144 APInt SignMask = APInt::getSignMask(EltBits);
48145 if (SDValue Res =
48146 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48147 // For vXi16 cases we need to use pmovmskb and extract every other
48148 // sign bit.
48149 SDLoc DL(EFLAGS);
48150 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48151 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48152 MVT FloatVT =
48153 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48154 Res = DAG.getBitcast(FloatVT, Res);
48155 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48156 } else if (EltBits == 16) {
48157 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48158 Res = DAG.getBitcast(MovmskVT, Res);
48159 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48160 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48161 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48162 } else {
48163 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48164 }
48165 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48166 DAG.getConstant(0, DL, MVT::i32));
48167 }
48168 }
48169 }
48170 }
48171
48172 // TESTZ(-1,X) == TESTZ(X,X)
48173 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48174 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48175
48176 // TESTZ(X,-1) == TESTZ(X,X)
48177 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48178 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48179
48180 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48181 // TODO: Add COND_NE handling?
48182 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48183 SDValue Src0 = peekThroughBitcasts(Op0);
48184 SDValue Src1 = peekThroughBitcasts(Op1);
48185 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48186 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48187 peekThroughBitcasts(Src0.getOperand(1)), true);
48188 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48189 peekThroughBitcasts(Src1.getOperand(1)), true);
48190 if (Src0 && Src1) {
48191 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48192 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48193 DAG.getBitcast(OpVT2, Src0),
48194 DAG.getBitcast(OpVT2, Src1));
48195 }
48196 }
48197 }
48198 }
48199
48200 return SDValue();
48201}
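// Worked example for the TEST*(~X,Y) == TEST*(X,Y) fold above: if Op0 is
// known to be NOT(X), then checking ZF of PTEST(~X, Y) (COND_E, i.e.
// (~X & Y) == 0) is the same as checking CF of PTEST(X, Y) (COND_B, which is
// (~X & Y) == 0 by the TESTC definition), so the NOT is dropped and CC is
// remapped by the switch at the top of the function.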
48202
48203// Attempt to simplify the MOVMSK input based on the comparison type.
48204 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48205 SelectionDAG &DAG,
48206 const X86Subtarget &Subtarget) {
48207 // Handle eq/ne against zero (any_of).
48208 // Handle eq/ne against -1 (all_of).
48209 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48210 return SDValue();
48211 if (EFLAGS.getValueType() != MVT::i32)
48212 return SDValue();
48213 unsigned CmpOpcode = EFLAGS.getOpcode();
48214 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48215 return SDValue();
48216 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48217 if (!CmpConstant)
48218 return SDValue();
48219 const APInt &CmpVal = CmpConstant->getAPIntValue();
48220
48221 SDValue CmpOp = EFLAGS.getOperand(0);
48222 unsigned CmpBits = CmpOp.getValueSizeInBits();
48223 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48224
48225 // Peek through any truncate.
48226 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48227 CmpOp = CmpOp.getOperand(0);
48228
48229 // Bail if we don't find a MOVMSK.
48230 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48231 return SDValue();
48232
48233 SDValue Vec = CmpOp.getOperand(0);
48234 MVT VecVT = Vec.getSimpleValueType();
48235 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48236 "Unexpected MOVMSK operand");
48237 unsigned NumElts = VecVT.getVectorNumElements();
48238 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48239
48240 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48241 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48242 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48243 if (!IsAnyOf && !IsAllOf)
48244 return SDValue();
48245
48246 // TODO: Check more combining cases.
48247 // We use the number of uses of the CMP operand to decide whether to combine.
48248 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds
48249 // below are gated on this one-use constraint.
48250 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48251
48252 // See if we can peek through to a vector with a wider element type, if the
48253 // signbits extend down to all the sub-elements as well.
48254 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48255 // potential SimplifyDemandedBits/Elts cases.
48256 // If we looked through a truncate that discards bits, we can't do this
48257 // transform.
48258 // FIXME: We could do this transform for truncates that discarded bits by
48259 // inserting an AND mask between the new MOVMSK and the CMP.
48260 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48261 SDValue BC = peekThroughBitcasts(Vec);
48262 MVT BCVT = BC.getSimpleValueType();
48263 unsigned BCNumElts = BCVT.getVectorNumElements();
48264 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48265 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48266 BCNumEltBits > NumEltBits &&
48267 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48268 SDLoc DL(EFLAGS);
48269 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48270 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48271 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48272 DAG.getConstant(CmpMask, DL, MVT::i32));
48273 }
48274 }
48275
48276 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48277 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48278 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48279 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48280 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48281 SmallVector<SDValue> Ops;
48282 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48283 Ops.size() == 2) {
48284 SDLoc DL(EFLAGS);
48285 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48286 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48287 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48288 DAG.getBitcast(SubVT, Ops[0]),
48289 DAG.getBitcast(SubVT, Ops[1]));
48290 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48291 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48292 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48293 DAG.getConstant(CmpMask, DL, MVT::i32));
48294 }
48295 }
48296
48297 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48298 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48299 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48300 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48301 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48302 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48303 SDValue BC = peekThroughBitcasts(Vec);
48304 // Ensure MOVMSK was testing every signbit of BC.
48305 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48306 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48307 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48308 BC.getOperand(0), BC.getOperand(1));
48309 V = DAG.getBitcast(TestVT, V);
48310 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48311 }
48312 // Check for 256-bit split vector cases.
48313 if (BC.getOpcode() == ISD::AND &&
48314 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48315 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48316 SDValue LHS = BC.getOperand(0);
48317 SDValue RHS = BC.getOperand(1);
48318 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48319 LHS.getOperand(0), LHS.getOperand(1));
48320 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48321 RHS.getOperand(0), RHS.getOperand(1));
48322 LHS = DAG.getBitcast(TestVT, LHS);
48323 RHS = DAG.getBitcast(TestVT, RHS);
48324 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48325 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48326 }
48327 }
48328 }
48329
48330 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
48331 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
48332 // sign bits prior to the comparison with zero unless we know that
48333 // the vXi16 splats the sign bit down to the lower i8 half.
48334 // TODO: Handle all_of patterns.
48335 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
48336 SDValue VecOp0 = Vec.getOperand(0);
48337 SDValue VecOp1 = Vec.getOperand(1);
48338 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
48339 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
48340 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48341 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
48342 SDLoc DL(EFLAGS);
48343 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
48344 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48345 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
48346 if (!SignExt0) {
48347 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
48348 DAG.getConstant(0xAAAA, DL, MVT::i16));
48349 }
48350 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48351 DAG.getConstant(0, DL, MVT::i16));
48352 }
48353 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
48354 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48355 if (CmpBits >= 16 && Subtarget.hasInt256() &&
48356 (IsAnyOf || (SignExt0 && SignExt1))) {
48357 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
48358 SDLoc DL(EFLAGS);
48359 SDValue Result = peekThroughBitcasts(Src);
48360 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
48361 Result.getValueType().getVectorNumElements() <= NumElts) {
48362 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
48363 Result.getOperand(0), Result.getOperand(1));
48364 V = DAG.getBitcast(MVT::v4i64, V);
48365 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48366 }
48367 Result = DAG.getBitcast(MVT::v32i8, Result);
48368 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48369 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
48370 if (!SignExt0 || !SignExt1) {
48371 assert(IsAnyOf &&
48372 "Only perform v16i16 signmasks for any_of patterns");
48373 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
48374 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48375 }
48376 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48377 DAG.getConstant(CmpMask, DL, MVT::i32));
48378 }
48379 }
48380 }
48381
48382 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
48383 // Since we peek through a bitcast, we need to be careful if the base vector
48384 // type has smaller elements than the MOVMSK type. In that case, even if
48385 // all the elements are demanded by the shuffle mask, only the "high"
48386 // elements which have highbits that align with highbits in the MOVMSK vec
48387 // elements are actually demanded. A simplification of spurious operations
48388 // on the "low" elements takes place during other simplifications.
48389 //
48390 // For example:
48391 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
48392 // demanded, swapping them around can change the result.
48393 //
48394 // To address this, we check that we can scale the shuffle mask to the MOVMSK
48395 // element width (this will ensure the "high" elements match). It's slightly
48396 // overly conservative, but fine for an edge case fold.
48397 SmallVector<int, 32> ShuffleMask;
48398 SmallVector<SDValue, 2> ShuffleInputs;
48399 if (NumElts <= CmpBits &&
48400 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
48401 ShuffleMask, DAG) &&
48402 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
48403 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
48404 canScaleShuffleElements(ShuffleMask, NumElts)) {
48405 SDLoc DL(EFLAGS);
48406 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
48407 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48408 Result =
48409 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
48410 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
48411 }
48412
48413 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
48414 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
48415 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
48416 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
48417 // iff every element is referenced.
48418 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
48419 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
48420 (NumEltBits == 32 || NumEltBits == 64)) {
48421 SDLoc DL(EFLAGS);
48422 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
48423 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
48424 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
48425 SDValue LHS = Vec;
48426 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
48427 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48428 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
48429 DAG.getBitcast(FloatVT, LHS),
48430 DAG.getBitcast(FloatVT, RHS));
48431 }
48432
48433 return SDValue();
48434}
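// For reference, the any_of/all_of patterns matched above are:
//   any_of: (X86ISD::CMP (MOVMSK V), 0) with CC == E/NE
//   all_of: (X86ISD::CMP/SUB (MOVMSK V), (1 << NumElts) - 1) with CC == E/NE
// e.g. on AVX targets an all_of sign-bit test such as
//   movmskps %xmm0, %eax ; cmpl $15, %eax ; sete %al
// can be rewritten by the final TESTP fold as
//   vtestps %xmm0, %xmm0 ; setb %al
// (CC is remapped E -> B / NE -> AE when comparing against the all-ones mask).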
48435
48436/// Optimize an EFLAGS definition used according to the condition code \p CC
48437/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
48438/// uses of chain values.
48439 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
48440 SelectionDAG &DAG,
48441 const X86Subtarget &Subtarget) {
48442 if (CC == X86::COND_B)
48443 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
48444 return Flags;
48445
48446 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
48447 return R;
48448
48449 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
48450 return R;
48451
48452 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
48453 return R;
48454
48455 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
48456 return R;
48457
48458 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
48459}
48460
48461/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
48462 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
48463 TargetLowering::DAGCombinerInfo &DCI,
48464 const X86Subtarget &Subtarget) {
48465 SDLoc DL(N);
48466
48467 SDValue FalseOp = N->getOperand(0);
48468 SDValue TrueOp = N->getOperand(1);
48469 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
48470 SDValue Cond = N->getOperand(3);
48471
48472 // cmov X, X, ?, ? --> X
48473 if (TrueOp == FalseOp)
48474 return TrueOp;
48475
48476 // Try to simplify the EFLAGS and condition code operands.
48477 // We can't always do this as FCMOV only supports a subset of X86 cond.
48478 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
48479 if (!(FalseOp.getValueType() == MVT::f80 ||
48480 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
48481 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
48482 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
48483 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
48484 Flags};
48485 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48486 }
48487 }
48488
48489 // If this is a select between two integer constants, try to do some
48490 // optimizations. Note that the operands are ordered the opposite of SELECT
48491 // operands.
48492 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
48493 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
48494 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
48495 // larger than FalseC (the false value).
48496 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
48497 CC = X86::GetOppositeBranchCondition(CC);
48498 std::swap(TrueC, FalseC);
48499 std::swap(TrueOp, FalseOp);
48500 }
48501
48502 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
48503 // This is efficient for any integer data type (including i8/i16) and
48504 // shift amount.
48505 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
48506 Cond = getSETCC(CC, Cond, DL, DAG);
48507
48508 // Zero extend the condition if needed.
48509 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
48510
48511 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
48512 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
48513 DAG.getConstant(ShAmt, DL, MVT::i8));
48514 return Cond;
48515 }
48516
48517 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
48518 // for any integer data type, including i8/i16.
48519 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
48520 Cond = getSETCC(CC, Cond, DL, DAG);
48521
48522 // Zero extend the condition if needed.
48523 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
48524 FalseC->getValueType(0), Cond);
48525 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48526 SDValue(FalseC, 0));
48527 return Cond;
48528 }
48529
48530 // Optimize cases that will turn into an LEA instruction. This requires
48531 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
48532 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
48533 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
48534 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
48535 "Implicit constant truncation");
48536
48537 bool isFastMultiplier = false;
48538 if (Diff.ult(10)) {
48539 switch (Diff.getZExtValue()) {
48540 default: break;
48541 case 1: // result = add base, cond
48542 case 2: // result = lea base( , cond*2)
48543 case 3: // result = lea base(cond, cond*2)
48544 case 4: // result = lea base( , cond*4)
48545 case 5: // result = lea base(cond, cond*4)
48546 case 8: // result = lea base( , cond*8)
48547 case 9: // result = lea base(cond, cond*8)
48548 isFastMultiplier = true;
48549 break;
48550 }
48551 }
48552
48553 if (isFastMultiplier) {
48554 Cond = getSETCC(CC, Cond, DL ,DAG);
48555 // Zero extend the condition if needed.
48556 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
48557 Cond);
48558 // Scale the condition by the difference.
48559 if (Diff != 1)
48560 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
48561 DAG.getConstant(Diff, DL, Cond.getValueType()));
48562
48563 // Add the base if non-zero.
48564 if (FalseC->getAPIntValue() != 0)
48565 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48566 SDValue(FalseC, 0));
48567 return Cond;
48568 }
48569 }
48570 }
48571 }
48572
48573 // Handle these cases:
48574 // (select (x != c), e, c) -> (select (x != c), e, x),
48575 // (select (x == c), c, e) -> (select (x == c), x, e)
48576 // where the c is an integer constant, and the "select" is the combination
48577 // of CMOV and CMP.
48578 //
48579 // The rationale for this change is that the conditional-move from a constant
48580 // needs two instructions, however, conditional-move from a register needs
48581 // only one instruction.
48582 //
48583 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
48584 // some instruction-combining opportunities. This opt needs to be
48585 // postponed as late as possible.
48586 //
48587 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
48588 // the DCI.xxxx conditions are provided to postpone the optimization as
48589 // late as possible.
48590
48591 ConstantSDNode *CmpAgainst = nullptr;
48592 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
48593 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
48594 !isa<ConstantSDNode>(Cond.getOperand(0))) {
48595
48596 if (CC == X86::COND_NE &&
48597 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
48598 CC = X86::COND_E;
48599 std::swap(TrueOp, FalseOp);
48600 }
48601
48602 if (CC == X86::COND_E &&
48603 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
48604 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
48605 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
48606 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48607 }
48608 }
48609 }
48610
48611 // Transform:
48612 //
48613 // (cmov 1 T (uge T 2))
48614 //
48615 // to:
48616 //
48617 // (adc T 0 (sub T 1))
48618 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
48619 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
48620 SDValue Cond0 = Cond.getOperand(0);
48621 if (Cond0.getOpcode() == ISD::TRUNCATE)
48622 Cond0 = Cond0.getOperand(0);
48623 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
48624 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
48625 EVT CondVT = Cond->getValueType(0);
48626 EVT OuterVT = N->getValueType(0);
48627 // Subtract 1 and generate a carry.
48628 SDValue NewSub =
48629 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
48630 DAG.getConstant(1, DL, CondVT));
48631 SDValue EFLAGS(NewSub.getNode(), 1);
48632 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
48633 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
48634 }
48635 }
48636
48637 // Fold and/or of setcc's to double CMOV:
48638 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
48639 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
48640 //
48641 // This combine lets us generate:
48642 // cmovcc1 (jcc1 if we don't have CMOV)
48643 // cmovcc2 (same)
48644 // instead of:
48645 // setcc1
48646 // setcc2
48647 // and/or
48648 // cmovne (jne if we don't have CMOV)
48649 // When we can't use the CMOV instruction, it might increase branch
48650 // mispredicts.
48651 // When we can use CMOV, or when there is no mispredict, this improves
48652 // throughput and reduces register pressure.
48653 //
48654 if (CC == X86::COND_NE) {
48655 SDValue Flags;
48656 X86::CondCode CC0, CC1;
48657 bool isAndSetCC;
48658 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
48659 if (isAndSetCC) {
48660 std::swap(FalseOp, TrueOp);
48661 CC0 = X86::GetOppositeBranchCondition(CC0);
48662 CC1 = X86::GetOppositeBranchCondition(CC1);
48663 }
48664
48665 SDValue LOps[] = {FalseOp, TrueOp,
48666 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
48667 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
48668 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
48669 Flags};
48670 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48671 return CMOV;
48672 }
48673 }
48674
48675 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
48676 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
48677 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
48678 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
48679 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
48680 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
48681 SDValue Add = TrueOp;
48682 SDValue Const = FalseOp;
48683 // Canonicalize the condition code for easier matching and output.
48684 if (CC == X86::COND_E)
48685 std::swap(Add, Const);
48686
48687 // We might have replaced the constant in the cmov with the LHS of the
48688 // compare. If so change it to the RHS of the compare.
48689 if (Const == Cond.getOperand(0))
48690 Const = Cond.getOperand(1);
48691
48692 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
48693 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
48694 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
48695 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
48696 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
48697 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
48698 EVT VT = N->getValueType(0);
48699 // This should constant fold.
48700 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
48701 SDValue CMov =
48702 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
48703 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
48704 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
48705 }
48706 }
48707
48708 return SDValue();
48709}
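// Worked example of the constant-select folds above: (cmov 0, 8, CC) becomes
// (shl (zext (setcc CC)), 3), and (cmov 2, 11, CC) has Diff == 9 (a fast LEA
// multiplier), so it becomes setcc + lea (cond + cond*8) + add of the base 2,
// i.e. cond ? 11 : 2 without a branch or a conditional move from a constant.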
48710
48711/// Different mul shrinking modes.
48712 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
48713 
48714 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
48715 EVT VT = N->getOperand(0).getValueType();
48716 if (VT.getScalarSizeInBits() != 32)
48717 return false;
48718
48719 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
48720 unsigned SignBits[2] = {1, 1};
48721 bool IsPositive[2] = {false, false};
48722 for (unsigned i = 0; i < 2; i++) {
48723 SDValue Opd = N->getOperand(i);
48724
48725 SignBits[i] = DAG.ComputeNumSignBits(Opd);
48726 IsPositive[i] = DAG.SignBitIsZero(Opd);
48727 }
48728
48729 bool AllPositive = IsPositive[0] && IsPositive[1];
48730 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
48731 // When ranges are from -128 ~ 127, use MULS8 mode.
48732 if (MinSignBits >= 25)
48733 Mode = ShrinkMode::MULS8;
48734 // When ranges are from 0 ~ 255, use MULU8 mode.
48735 else if (AllPositive && MinSignBits >= 24)
48736 Mode = ShrinkMode::MULU8;
48737 // When ranges are from -32768 ~ 32767, use MULS16 mode.
48738 else if (MinSignBits >= 17)
48739 Mode = ShrinkMode::MULS16;
48740 // When ranges are from 0 ~ 65535, use MULU16 mode.
48741 else if (AllPositive && MinSignBits >= 16)
48742 Mode = ShrinkMode::MULU16;
48743 else
48744 return false;
48745 return true;
48746}
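// For example, if both vXi32 operands of the multiply were sign-extended from
// i8, ComputeNumSignBits reports at least 25 sign bits for each, so MULS8 is
// chosen; if both were zero-extended from i8 they are known positive with at
// least 24 sign bits, selecting MULU8.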
48747
48748/// When the operands of vector mul are extended from smaller size values,
48749 /// like i8 and i16, the type of mul may be shrunk to generate more
48750/// efficient code. Two typical patterns are handled:
48751/// Pattern1:
48752/// %2 = sext/zext <N x i8> %1 to <N x i32>
48753/// %4 = sext/zext <N x i8> %3 to <N x i32>
48754// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48755/// %5 = mul <N x i32> %2, %4
48756///
48757/// Pattern2:
48758/// %2 = zext/sext <N x i16> %1 to <N x i32>
48759/// %4 = zext/sext <N x i16> %3 to <N x i32>
48760/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48761/// %5 = mul <N x i32> %2, %4
48762///
48763/// There are four mul shrinking modes:
48764/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
48765/// -128 to 128, and the scalar value range of %4 is also -128 to 128,
48766/// generate pmullw+sext32 for it (MULS8 mode).
48767/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
48768/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
48769/// generate pmullw+zext32 for it (MULU8 mode).
48770/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
48771/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
48772/// generate pmullw+pmulhw for it (MULS16 mode).
48773/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
48774/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
48775/// generate pmullw+pmulhuw for it (MULU16 mode).
48776 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
48777 const X86Subtarget &Subtarget) {
48778 // Check for legality
48779 // pmullw/pmulhw are not supported by SSE.
48780 if (!Subtarget.hasSSE2())
48781 return SDValue();
48782
48783 // Check for profitability
48784 // pmulld is supported since SSE41. It is better to use pmulld
48785 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
48786 // the expansion.
48787 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
48788 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
48789 return SDValue();
48790
48791 ShrinkMode Mode;
48792 if (!canReduceVMulWidth(N, DAG, Mode))
48793 return SDValue();
48794
48795 SDValue N0 = N->getOperand(0);
48796 SDValue N1 = N->getOperand(1);
48797 EVT VT = N->getOperand(0).getValueType();
48798 unsigned NumElts = VT.getVectorNumElements();
48799 if ((NumElts % 2) != 0)
48800 return SDValue();
48801
48802 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
48803
48804 // Shrink the operands of mul.
48805 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
48806 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
48807
48808 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
48809 // lower part is needed.
48810 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48811 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48812 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48813 : ISD::SIGN_EXTEND,
48814 DL, VT, MulLo);
48815
48816 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48817 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48818 // the higher part is also needed.
48819 SDValue MulHi =
48820 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48821 ReducedVT, NewN0, NewN1);
48822
48823 // Repack the lower part and higher part result of mul into a wider
48824 // result.
48825 // Generate shuffle functioning as punpcklwd.
48826 SmallVector<int, 16> ShuffleMask(NumElts);
48827 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48828 ShuffleMask[2 * i] = i;
48829 ShuffleMask[2 * i + 1] = i + NumElts;
48830 }
48831 SDValue ResLo =
48832 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48833 ResLo = DAG.getBitcast(ResVT, ResLo);
48834 // Generate shuffle functioning as punpckhwd.
48835 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48836 ShuffleMask[2 * i] = i + NumElts / 2;
48837 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48838 }
48839 SDValue ResHi =
48840 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48841 ResHi = DAG.getBitcast(ResVT, ResHi);
48842 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48843}
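// Repacking note (illustrative): for the MULU16/MULS16 modes above, pmullw
// produces the low 16 bits and pmulhuw/pmulhw the high 16 bits of each
// product; the two shuffles interleave them as (lo0,hi0,lo1,hi1,...) so that,
// after the bitcast to i32 elements, each (lo,hi) pair forms one full 32-bit
// product, and the two halves are concatenated back to the original type.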
48844
48845 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48846 EVT VT, const SDLoc &DL) {
48847
48848 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48849 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48850 DAG.getConstant(Mult, DL, VT));
48851 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48852 DAG.getConstant(Shift, DL, MVT::i8));
48853 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48854 N->getOperand(0));
48855 return Result;
48856 };
48857
48858 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48859 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48860 DAG.getConstant(Mul1, DL, VT));
48861 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48862 DAG.getConstant(Mul2, DL, VT));
48863 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48864 N->getOperand(0));
48865 return Result;
48866 };
48867
48868 switch (MulAmt) {
48869 default:
48870 break;
48871 case 11:
48872 // mul x, 11 => add ((shl (mul x, 5), 1), x)
48873 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48874 case 21:
48875 // mul x, 21 => add ((shl (mul x, 5), 2), x)
48876 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48877 case 41:
48878 // mul x, 41 => add ((shl (mul x, 5), 3), x)
48879 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48880 case 22:
48881 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48882 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48883 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48884 case 19:
48885 // mul x, 19 => add ((shl (mul x, 9), 1), x)
48886 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48887 case 37:
48888 // mul x, 37 => add ((shl (mul x, 9), 2), x)
48889 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48890 case 73:
48891 // mul x, 73 => add ((shl (mul x, 9), 3), x)
48892 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48893 case 13:
48894 // mul x, 13 => add ((shl (mul x, 3), 2), x)
48895 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48896 case 23:
48897 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48898 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48899 case 26:
48900 // mul x, 26 => add ((mul (mul x, 5), 5), x)
48901 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48902 case 28:
48903 // mul x, 28 => add ((mul (mul x, 9), 3), x)
48904 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48905 case 29:
48906 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48907 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48908 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48909 }
48910
48911 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
48912 // followed by a single LEA.
48913 // First check if this is a sum of two powers of 2 because that's easy. Then
48914 // count how many trailing zeros there are up to the first set bit.
48915 // TODO: We can do this even without LEA at a cost of two shifts and an add.
48916 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48917 unsigned ScaleShift = llvm::countr_zero(MulAmt);
48918 if (ScaleShift >= 1 && ScaleShift < 4) {
48919 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48920 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48921 DAG.getConstant(ShiftAmt, DL, MVT::i8));
48922 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48923 DAG.getConstant(ScaleShift, DL, MVT::i8));
48924 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48925 }
48926 }
48927
48928 return SDValue();
48929}
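// Worked arithmetic for two of the cases above: MulAmt == 11 uses
// combineMulShlAddOrSub(5, 1, true), i.e. ((x * 5) << 1) + x = 10x + x = 11x.
// The trailing power-of-2 trick covers e.g. MulAmt == 20 = 16 + 4:
// countr_zero(20) == 2 and (20 & 19) == 16, so it emits (x << 4) + (x << 2).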
48930
48931 // If the upper 17 bits of either element are zero and the upper bits of the
48932 // other element are all zero/sign bits, then we can use PMADDWD, which is
48933 // always at least as quick as PMULLD, except on KNL.
48934 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
48935 SelectionDAG &DAG,
48936 const X86Subtarget &Subtarget) {
48937 if (!Subtarget.hasSSE2())
48938 return SDValue();
48939
48940 if (Subtarget.isPMADDWDSlow())
48941 return SDValue();
48942
48943 EVT VT = N->getValueType(0);
48944
48945 // Only support vXi32 vectors.
48946 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48947 return SDValue();
48948
48949 // Make sure the type is legal or can split/widen to a legal type.
48950 // With AVX512 but without BWI, we would need to split v32i16.
48951 unsigned NumElts = VT.getVectorNumElements();
48952 if (NumElts == 1 || !isPowerOf2_32(NumElts))
48953 return SDValue();
48954
48955 // With AVX512 but without BWI, we would need to split v32i16.
48956 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48957 return SDValue();
48958
48959 SDValue N0 = N->getOperand(0);
48960 SDValue N1 = N->getOperand(1);
48961
48962 // If we are zero/sign extending in two steps without SSE4.1, it's better to
48963 // reduce the vmul width instead.
48964 if (!Subtarget.hasSSE41() &&
48965 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48966 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48967 (N1.getOpcode() == ISD::ZERO_EXTEND &&
48968 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48969 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48970 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48971 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48972 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48973 return SDValue();
48974
48975 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
48976 // the vmul width instead.
48977 if (!Subtarget.hasSSE41() &&
48978 (N0.getOpcode() == ISD::SIGN_EXTEND &&
48979 N0.getOperand(0).getValueSizeInBits() > 128) &&
48980 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48981 N1.getOperand(0).getValueSizeInBits() > 128))
48982 return SDValue();
48983
48984 // Sign bits must extend down to the lowest i16.
48985 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48986 DAG.ComputeMaxSignificantBits(N0) > 16)
48987 return SDValue();
48988
48989 // At least one of the elements must be zero in the upper 17 bits, or can be
48990 // safely made zero without altering the final result.
48991 auto GetZeroableOp = [&](SDValue Op) {
48992 APInt Mask17 = APInt::getHighBitsSet(32, 17);
48993 if (DAG.MaskedValueIsZero(Op, Mask17))
48994 return Op;
48995 // Mask off upper 16-bits of sign-extended constants.
48996 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48997 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
48998 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48999 SDValue Src = Op.getOperand(0);
49000 // Convert sext(vXi16) to zext(vXi16).
49001 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49002 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49003 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49004 // which will expand the extension.
49005 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49006 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49007 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49008 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49009 }
49010 }
49011 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49012 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49013 N->isOnlyUserOf(Op.getNode())) {
49014 SDValue Src = Op.getOperand(0);
49015 if (Src.getScalarValueSizeInBits() == 16)
49016 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49017 }
49018 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49019 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49020 N->isOnlyUserOf(Op.getNode())) {
49021 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49022 Op.getOperand(1));
49023 }
49024 return SDValue();
49025 };
49026 SDValue ZeroN0 = GetZeroableOp(N0);
49027 SDValue ZeroN1 = GetZeroableOp(N1);
49028 if (!ZeroN0 && !ZeroN1)
49029 return SDValue();
49030 N0 = ZeroN0 ? ZeroN0 : N0;
49031 N1 = ZeroN1 ? ZeroN1 : N1;
49032
49033 // Use SplitOpsAndApply to handle AVX splitting.
49034 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49035 ArrayRef<SDValue> Ops) {
49036 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49037 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49038 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49039 DAG.getBitcast(OpVT, Ops[0]),
49040 DAG.getBitcast(OpVT, Ops[1]));
49041 };
49042 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49043}
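// Rough sketch of why the PMADDWD fold above is sound: VPMADDWD computes, per
// i32 lane, a[2i]*b[2i] + a[2i+1]*b[2i+1] on signed i16 halves. The checks
// guarantee every 32-bit element of both operands has at most 16 significant
// bits and that one operand has its upper 17 bits zero, so the high i16 half
// of that operand is 0 and the pairwise sum collapses to the single product
// of the low halves, matching the original 32-bit multiply.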
49044
49045 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49046 const X86Subtarget &Subtarget) {
49047 if (!Subtarget.hasSSE2())
49048 return SDValue();
49049
49050 EVT VT = N->getValueType(0);
49051
49052 // Only support vXi64 vectors.
49053 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49054 VT.getVectorNumElements() < 2 ||
49055 !isPowerOf2_32(VT.getVectorNumElements()))
49056 return SDValue();
49057
49058 SDValue N0 = N->getOperand(0);
49059 SDValue N1 = N->getOperand(1);
49060
49061 // MULDQ returns the 64-bit result of the signed multiplication of the lower
49062 // 32-bits. We can lower with this if the sign bits stretch that far.
49063 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49064 DAG.ComputeNumSignBits(N1) > 32) {
49065 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49066 ArrayRef<SDValue> Ops) {
49067 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49068 };
49069 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49070 /*CheckBWI*/ false);
49071 }
49072
49073 // If the upper bits are zero we can use a single pmuludq.
49074 APInt Mask = APInt::getHighBitsSet(64, 32);
49075 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49076 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49077 ArrayRef<SDValue> Ops) {
49078 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49079 };
49080 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49081 /*CheckBWI*/ false);
49082 }
49083
49084 return SDValue();
49085}
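// e.g. a v2i64 multiply whose operands both have their upper 32 bits known
// zero becomes a single PMULUDQ, which multiplies the low 32 bits of each
// i64 lane and produces the full 64-bit product; the PMULDQ path handles the
// analogous signed case when more than 32 sign bits are known on both sides.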
49086
49087 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49088 TargetLowering::DAGCombinerInfo &DCI,
49089 const X86Subtarget &Subtarget) {
49090 EVT VT = N->getValueType(0);
49091 SDLoc DL(N);
49092
49093 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49094 return V;
49095
49096 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49097 return V;
49098
49099 if (DCI.isBeforeLegalize() && VT.isVector())
49100 return reduceVMULWidth(N, DL, DAG, Subtarget);
49101
49102 if (VT != MVT::i64 && VT != MVT::i32 &&
49103 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49104 return SDValue();
49105
49106 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49107 if (!Known1.isConstant())
49108 return SDValue();
49109
49110 const APInt &C = Known1.getConstant();
49111 if (C.isZero())
49112 return DAG.getConstant(0, DL, VT);
49113
49114 if (C.isAllOnes())
49115 return DAG.getNegative(N->getOperand(0), DL, VT);
49116
49117 if (isPowerOf2_64(C.getZExtValue()))
49118 return SDValue();
49119
49120 // Optimize a single multiply with constant into two operations in order to
49121 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49122 if (!MulConstantOptimization)
49123 return SDValue();
49124
49125 // An imul is usually smaller than the alternative sequence.
49126 if (DAG.getMachineFunction().getFunction().hasMinSize())
49127 return SDValue();
49128
49129 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49130 return SDValue();
49131
49132 int64_t SignMulAmt = C.getSExtValue();
49133 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49134 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49135
49136 SDValue NewMul = SDValue();
49137 if (VT == MVT::i64 || VT == MVT::i32) {
49138 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49139 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49140 DAG.getConstant(AbsMulAmt, DL, VT));
49141 if (SignMulAmt < 0)
49142 NewMul = DAG.getNegative(NewMul, DL, VT);
49143
49144 return NewMul;
49145 }
49146
49147 uint64_t MulAmt1 = 0;
49148 uint64_t MulAmt2 = 0;
49149 if ((AbsMulAmt % 9) == 0) {
49150 MulAmt1 = 9;
49151 MulAmt2 = AbsMulAmt / 9;
49152 } else if ((AbsMulAmt % 5) == 0) {
49153 MulAmt1 = 5;
49154 MulAmt2 = AbsMulAmt / 5;
49155 } else if ((AbsMulAmt % 3) == 0) {
49156 MulAmt1 = 3;
49157 MulAmt2 = AbsMulAmt / 3;
49158 }
49159
49160 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49161 if (MulAmt2 &&
49162 (isPowerOf2_64(MulAmt2) ||
49163 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49164
49165 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49166 N->user_begin()->getOpcode() == ISD::ADD))
49167 // If the second multiplier is a power of 2, issue it first. We want the multiply
49168 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49169 // use is an add. Only do this for positive multiply amounts since the
49170 // negate would prevent it from being used as an address mode anyway.
49171 std::swap(MulAmt1, MulAmt2);
49172
49173 if (isPowerOf2_64(MulAmt1))
49174 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49175 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49176 else
49177 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49178 DAG.getConstant(MulAmt1, DL, VT));
49179
49180 if (isPowerOf2_64(MulAmt2))
49181 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49182 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49183 else
49184 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49185 DAG.getConstant(MulAmt2, DL, VT));
49186
49187 // Negate the result.
49188 if (SignMulAmt < 0)
49189 NewMul = DAG.getNegative(NewMul, DL, VT);
49190 } else if (!Subtarget.slowLEA())
49191 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49192 }
49193 if (!NewMul) {
49194 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49195 if (isPowerOf2_64(AbsMulAmt - 1)) {
49196 // (mul x, 2^N + 1) => (add (shl x, N), x)
49197 NewMul = DAG.getNode(
49198 ISD::ADD, DL, VT, N->getOperand(0),
49199 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49200 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49201 if (SignMulAmt < 0)
49202 NewMul = DAG.getNegative(NewMul, DL, VT);
49203 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49204 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49205 NewMul =
49206 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49207 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49208 // To negate, reverse the operands of the subtract.
49209 if (SignMulAmt < 0)
49210 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49211 else
49212 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49213 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49214 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49215 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49216 NewMul =
49217 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49218 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49219 NewMul = DAG.getNode(
49220 ISD::ADD, DL, VT, NewMul,
49221 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49222 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49223 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49224 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49225 NewMul =
49226 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49227 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49228 NewMul = DAG.getNode(
49229 ISD::SUB, DL, VT, NewMul,
49230 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49231 } else if (SignMulAmt >= 0 && VT.isVector() &&
49232 Subtarget.fastImmVectorShift()) {
49233 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49234 uint64_t ShiftAmt1;
49235 std::optional<unsigned> Opc;
49236 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49237 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49238 Opc = ISD::ADD;
49239 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49240 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49241 Opc = ISD::SUB;
49242 }
49243
49244 if (Opc) {
49245 SDValue Shift1 =
49246 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49247 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49248 SDValue Shift2 =
49249 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49250 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49251 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49252 }
49253 }
49254 }
49255
49256 return NewMul;
49257}
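// Worked examples for the constant decomposition above: C == 45 = 9 * 5 is
// emitted as two LEA-style MUL_IMM nodes (x*9 then *5); C == 40 = 5 * 8
// becomes one MUL_IMM by 5 and one shift by 3; C == -7 takes the
// isPowerOf2_64(AbsMulAmt + 1) path, building (x << 3) - x and then reversing
// the subtraction to negate, i.e. x - (x << 3) == -7x.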
49258
49259// Try to form a MULHU or MULHS node by looking for
49260// (srl (mul ext, ext), 16)
49261// TODO: This is X86 specific because we want to be able to handle wide types
49262// before type legalization. But we can only do it if the vector will be
49263// legalized via widening/splitting. Type legalization can't handle promotion
49264// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49265// combiner.
49266 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49267 const SDLoc &DL,
49268 const X86Subtarget &Subtarget) {
49269 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49270 "SRL or SRA node is required here!");
49271
49272 if (!Subtarget.hasSSE2())
49273 return SDValue();
49274
49275 // The operation feeding into the shift must be a multiply.
49276 SDValue ShiftOperand = N->getOperand(0);
49277 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
49278 return SDValue();
49279
49280 // Input type should be at least vXi32.
49281 EVT VT = N->getValueType(0);
49282 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49283 return SDValue();
49284
49285 // Need a shift by 16.
49286 APInt ShiftAmt;
49287 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
49288 ShiftAmt != 16)
49289 return SDValue();
49290
49291 SDValue LHS = ShiftOperand.getOperand(0);
49292 SDValue RHS = ShiftOperand.getOperand(1);
49293
49294 unsigned ExtOpc = LHS.getOpcode();
49295 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49296 RHS.getOpcode() != ExtOpc)
49297 return SDValue();
49298
49299 // Peek through the extends.
49300 LHS = LHS.getOperand(0);
49301 RHS = RHS.getOperand(0);
49302
49303 // Ensure the input types match.
49304 EVT MulVT = LHS.getValueType();
49305 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49306 return SDValue();
49307
49308 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49309 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49310
49311 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49312 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49313}
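// e.g. (srl (mul (zext v8i16:X to v8i32), (zext v8i16:Y to v8i32)), 16)
// becomes (zext (mulhu X, Y) to v8i32), i.e. a single pmulhuw; the SRA /
// sign-extend variant forms MULHS and selects pmulhw instead.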
49314
49315 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
49316 const X86Subtarget &Subtarget) {
49317 using namespace llvm::SDPatternMatch;
49318 SDValue N0 = N->getOperand(0);
49319 SDValue N1 = N->getOperand(1);
49320 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49321 EVT VT = N0.getValueType();
49322 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49323 SDLoc DL(N);
49324
49325 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49326 // with out-of-bounds clamping.
49327 if (N0.getOpcode() == ISD::VSELECT &&
49328 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
49329 SDValue Cond = N0.getOperand(0);
49330 SDValue N00 = N0.getOperand(1);
49331 SDValue N01 = N0.getOperand(2);
49332 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
49333 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
49334 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49335 m_SpecificCondCode(ISD::SETULT)))) {
49336 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
49337 }
49338 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
49339 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
49340 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49341 m_SpecificCondCode(ISD::SETUGE)))) {
49342 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
49343 }
49344 }
49345
49346 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
49347 // since the result of setcc_c is all zero's or all ones.
49348 if (VT.isInteger() && !VT.isVector() &&
49349 N1C && N0.getOpcode() == ISD::AND &&
49350 N0.getOperand(1).getOpcode() == ISD::Constant) {
49351 SDValue N00 = N0.getOperand(0);
49352 APInt Mask = N0.getConstantOperandAPInt(1);
49353 Mask <<= N1C->getAPIntValue();
49354 bool MaskOK = false;
49355 // We can handle cases concerning bit-widening nodes containing setcc_c if
49356 // we carefully interrogate the mask to make sure we are semantics
49357 // preserving.
49358 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
49359 // of the underlying setcc_c operation if the setcc_c was zero extended.
49360 // Consider the following example:
49361 // zext(setcc_c) -> i32 0x0000FFFF
49362 // c1 -> i32 0x0000FFFF
49363 // c2 -> i32 0x00000001
49364 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
49365 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
49366 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
49367 MaskOK = true;
49368 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
49369 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49370 MaskOK = true;
49371 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
49372 N00.getOpcode() == ISD::ANY_EXTEND) &&
49373 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49374 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
49375 }
49376 if (MaskOK && Mask != 0)
49377 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
49378 }
49379
49380 return SDValue();
49381}
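// Illustrative case for the VSHLV fold above: on a v8i32 shift,
// (shl (vselect (setult amt, 32), x, 0), amt) is just X86ISD::VSHLV(x, amt),
// because vpsllvd already produces 0 for any shift amount >= 32, making the
// zero-select redundant.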
49382
49383 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
49384 const X86Subtarget &Subtarget) {
49385 using namespace llvm::SDPatternMatch;
49386 SDValue N0 = N->getOperand(0);
49387 SDValue N1 = N->getOperand(1);
49388 EVT VT = N0.getValueType();
49389 unsigned Size = VT.getSizeInBits();
49390 SDLoc DL(N);
49391
49392 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
49393 return V;
49394
49395 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
49396 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
49397 SDValue ShrAmtVal;
49398 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
49399 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
49400 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
49401 }
49402
49403 // fold (SRA (SHL X, ShlConst), SraConst)
49404 // into (SHL (sext_in_reg X), ShlConst - SraConst)
49405 // or (sext_in_reg X)
49406 // or (SRA (sext_in_reg X), SraConst - ShlConst)
49407 // depending on relation between SraConst and ShlConst.
49408 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
49409 // us to do the sext_in_reg from the corresponding bit.
49410
49411 // sexts in X86 are MOVs. The MOVs have the same code size
49412 // as the SHIFTs above (only a shift by 1 has smaller code size).
49413 // However the MOVs have two advantages over a SHIFT:
49414 // 1. MOVs can write to a register that differs from the source.
49415 // 2. MOVs accept memory operands.
49416
49417 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
49418 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
49419 N0.getOperand(1).getOpcode() != ISD::Constant)
49420 return SDValue();
49421
49422 SDValue N00 = N0.getOperand(0);
49423 SDValue N01 = N0.getOperand(1);
49424 APInt ShlConst = N01->getAsAPIntVal();
49425 APInt SraConst = N1->getAsAPIntVal();
49426 EVT CVT = N1.getValueType();
49427
49428 if (CVT != N01.getValueType())
49429 return SDValue();
49430 if (SraConst.isNegative())
49431 return SDValue();
49432
49433 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
49434 unsigned ShiftSize = SVT.getSizeInBits();
49435 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
49436 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
49437 continue;
49438 SDValue NN =
49439 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
49440 if (SraConst.eq(ShlConst))
49441 return NN;
49442 if (SraConst.ult(ShlConst))
49443 return DAG.getNode(ISD::SHL, DL, VT, NN,
49444 DAG.getConstant(ShlConst - SraConst, DL, CVT));
49445 return DAG.getNode(ISD::SRA, DL, VT, NN,
49446 DAG.getConstant(SraConst - ShlConst, DL, CVT));
49447 }
49448 return SDValue();
49449}
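// Example of the (SRA (SHL X, ShlConst), SraConst) fold above for i32 X:
// (sra (shl X, 24), 26) has Size - ShlConst == 8, so it becomes
// (sra (sext_in_reg X, i8), 2); when SraConst == ShlConst == 24 the result is
// just (sext_in_reg X, i8), i.e. a single movsbl.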
49450
49451 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
49452 TargetLowering::DAGCombinerInfo &DCI,
49453 const X86Subtarget &Subtarget) {
49454 using namespace llvm::SDPatternMatch;
49455 SDValue N0 = N->getOperand(0);
49456 SDValue N1 = N->getOperand(1);
49457 EVT VT = N0.getValueType();
49458 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49459 SDLoc DL(N);
49460
49461 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
49462 return V;
49463
49464 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49465 // with out-of-bounds clamping.
49466 if (N0.getOpcode() == ISD::VSELECT &&
49467 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
49468 SDValue Cond = N0.getOperand(0);
49469 SDValue N00 = N0.getOperand(1);
49470 SDValue N01 = N0.getOperand(2);
49471 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
49472 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
49473 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49474 m_SpecificCondCode(ISD::SETULT)))) {
49475 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
49476 }
49477 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
49478 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
49479 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49480 m_SpecificCondCode(ISD::SETUGE)))) {
49481 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
49482 }
49483 }
49484
49485 // Only do this on the last DAG combine as it can interfere with other
49486 // combines.
49487 if (!DCI.isAfterLegalizeDAG())
49488 return SDValue();
49489
49490 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
49491 // TODO: This is a generic DAG combine that became an x86-only combine to
49492 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
49493 // and-not ('andn').
49494 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
49495 return SDValue();
49496
49497 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
49498 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
49499 if (!ShiftC || !AndC)
49500 return SDValue();
49501
49502 // If we can shrink the constant mask below 8-bits or 32-bits, then this
49503 // transform should reduce code size. It may also enable secondary transforms
49504 // from improved known-bits analysis or instruction selection.
49505 APInt MaskVal = AndC->getAPIntValue();
49506
49507 // If this can be matched by a zero extend, don't optimize.
49508 if (MaskVal.isMask()) {
49509 unsigned TO = MaskVal.countr_one();
49510 if (TO >= 8 && isPowerOf2_32(TO))
49511 return SDValue();
49512 }
49513
49514 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
49515 unsigned OldMaskSize = MaskVal.getSignificantBits();
49516 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
49517 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
49518 (OldMaskSize > 32 && NewMaskSize <= 32)) {
49519 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
49520 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
49521 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
49522 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
49523 }
49524 return SDValue();
49525}
49526
49527static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
49528 const X86Subtarget &Subtarget) {
49529 unsigned Opcode = N->getOpcode();
49530 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
49531
49532 SDLoc DL(N);
49533 EVT VT = N->getValueType(0);
49534 SDValue N0 = N->getOperand(0);
49535 SDValue N1 = N->getOperand(1);
49536 EVT SrcVT = N0.getValueType();
49537
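 // Look through one-use bitcasts of the operands (when this node is their only
 // user) so that shuffle patterns hidden behind casts stay visible to the
 // folds below.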
49538 SDValue BC0 =
49539 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
49540 SDValue BC1 =
49541 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
49542
49543 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
49544 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
49545 // truncation trees that help us avoid lane crossing shuffles.
49546 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
49547 // TODO: We don't handle vXf64 shuffles yet.
49548 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
49549 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
49550 SmallVector<SDValue> ShuffleOps;
49551 SmallVector<int> ShuffleMask, ScaledMask;
49552 SDValue Vec = peekThroughBitcasts(BCSrc);
49553 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
49554 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
49555 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
49556 // shuffle to a v4X64 width - we can probably relax this in the future.
49557 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
49558 ShuffleOps[0].getValueType().is256BitVector() &&
49559 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
49560 SDValue Lo, Hi;
49561 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
49562 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
49563 Lo = DAG.getBitcast(SrcVT, Lo);
49564 Hi = DAG.getBitcast(SrcVT, Hi);
49565 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
49566 Res = DAG.getBitcast(ShufVT, Res);
49567 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
49568 return DAG.getBitcast(VT, Res);
49569 }
49570 }
49571 }
49572 }
49573
49574 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
49575 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
49576 // If either/both ops are a shuffle that can scale to v2x64,
49577 // then see if we can perform this as a v4x32 post shuffle.
49578 SmallVector<SDValue> Ops0, Ops1;
49579 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
49580 bool IsShuf0 =
49581 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
49582 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
49583 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
49584 bool IsShuf1 =
49585 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
49586 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
49587 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
49588 if (IsShuf0 || IsShuf1) {
49589 if (!IsShuf0) {
49590 Ops0.assign({BC0});
49591 ScaledMask0.assign({0, 1});
49592 }
49593 if (!IsShuf1) {
49594 Ops1.assign({BC1});
49595 ScaledMask1.assign({0, 1});
49596 }
49597
49598 SDValue LHS, RHS;
49599 int PostShuffle[4] = {-1, -1, -1, -1};
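 // Map each scaled mask element onto a post-shuffle slot: elements taken from
 // the first distinct source (LHS) keep indices 0/1, elements from a second
 // distinct source (RHS) are offset by 2, and a third source aborts the fold.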
49600 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
49601 if (M < 0)
49602 return true;
49603 Idx = M % 2;
49604 SDValue Src = Ops[M / 2];
49605 if (!LHS || LHS == Src) {
49606 LHS = Src;
49607 return true;
49608 }
49609 if (!RHS || RHS == Src) {
49610 Idx += 2;
49611 RHS = Src;
49612 return true;
49613 }
49614 return false;
49615 };
49616 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
49617 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
49618 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
49619 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
49620 LHS = DAG.getBitcast(SrcVT, LHS);
49621 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
49622 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
49623 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
49624 Res = DAG.getBitcast(ShufVT, Res);
49625 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
49626 return DAG.getBitcast(VT, Res);
49627 }
49628 }
49629 }
49630
49631 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
49632 if (VT.is256BitVector() && Subtarget.hasInt256()) {
49633 SmallVector<int> Mask0, Mask1;
49634 SmallVector<SDValue> Ops0, Ops1;
49635 SmallVector<int, 2> ScaledMask0, ScaledMask1;
49636 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
49637 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
49638 !Ops0.empty() && !Ops1.empty() &&
49639 all_of(Ops0,
49640 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49641 all_of(Ops1,
49642 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49643 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
49644 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
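 // If the two shuffles read the same pair of 256-bit sources (possibly in
 // swapped order), commute the second mask so both use (Op00, Op01); a single
 // HOP followed by a v4x64 lane shuffle then reproduces the original result.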
49645 SDValue Op00 = peekThroughBitcasts(Ops0.front());
49646 SDValue Op10 = peekThroughBitcasts(Ops1.front());
49647 SDValue Op01 = peekThroughBitcasts(Ops0.back());
49648 SDValue Op11 = peekThroughBitcasts(Ops1.back());
49649 if ((Op00 == Op11) && (Op01 == Op10)) {
49650 std::swap(Op10, Op11);
49651 ShuffleVectorSDNode::commuteMask(ScaledMask1);
49652 }
49653 if ((Op00 == Op10) && (Op01 == Op11)) {
49654 const int Map[4] = {0, 2, 1, 3};
49655 SmallVector<int, 4> ShuffleMask(
49656 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
49657 Map[ScaledMask1[1]]});
49658 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
49659 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
49660 DAG.getBitcast(SrcVT, Op01));
49661 Res = DAG.getBitcast(ShufVT, Res);
49662 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
49663 return DAG.getBitcast(VT, Res);
49664 }
49665 }
49666 }
49667
49668 return SDValue();
49669}
49670
49671static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
49672 TargetLowering::DAGCombinerInfo &DCI,
49673 const X86Subtarget &Subtarget) {
49674 unsigned Opcode = N->getOpcode();
49675 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
49676 "Unexpected pack opcode");
49677
49678 EVT VT = N->getValueType(0);
49679 SDValue N0 = N->getOperand(0);
49680 SDValue N1 = N->getOperand(1);
49681 unsigned NumDstElts = VT.getVectorNumElements();
49682 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
49683 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
49684 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
49685 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
49686 "Unexpected PACKSS/PACKUS input type");
49687
49688 bool IsSigned = (X86ISD::PACKSS == Opcode);
49689
49690 // Constant Folding.
49691 APInt UndefElts0, UndefElts1;
49692 SmallVector<APInt, 32> EltBits0, EltBits1;
49693 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
49694 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
49695 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
49696 /*AllowWholeUndefs*/ true,
49697 /*AllowPartialUndefs*/ true) &&
49698 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
49699 /*AllowWholeUndefs*/ true,
49700 /*AllowPartialUndefs*/ true)) {
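 // PACKSS/PACKUS interleave per 128-bit lane: in each lane the low half of the
 // destination elements comes from N0 and the high half from N1, so remap the
 // source elements lane by lane when constant folding.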
49701 unsigned NumLanes = VT.getSizeInBits() / 128;
49702 unsigned NumSrcElts = NumDstElts / 2;
49703 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
49704 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
49705
49706 APInt Undefs(NumDstElts, 0);
49707 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
49708 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
49709 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
49710 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
49711 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
49712 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
49713
49714 if (UndefElts[SrcIdx]) {
49715 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
49716 continue;
49717 }
49718
49719 APInt &Val = EltBits[SrcIdx];
49720 if (IsSigned) {
49721 // PACKSS: Truncate signed value with signed saturation.
49722 // Source values less than dst minint are saturated to minint.
49723 // Source values greater than dst maxint are saturated to maxint.
49724 Val = Val.truncSSat(DstBitsPerElt);
49725 } else {
49726 // PACKUS: Truncate signed value with unsigned saturation.
49727 // Source values less than zero are saturated to zero.
49728 // Source values greater than dst maxuint are saturated to maxuint.
49729 // NOTE: This is different from APInt::truncUSat.
49730 if (Val.isIntN(DstBitsPerElt))
49731 Val = Val.trunc(DstBitsPerElt);
49732 else if (Val.isNegative())
49733 Val = APInt::getZero(DstBitsPerElt);
49734 else
49735 Val = APInt::getAllOnes(DstBitsPerElt);
49736 }
49737 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
49738 }
49739 }
49740
49741 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
49742 }
49743
49744 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
49745 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49746 return V;
49747
49748 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
49749 // Currently limit this to allsignbits cases only.
49750 if (IsSigned &&
49751 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
49752 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
49753 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
49754 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
49755 if (Not0 && Not1) {
49756 SDLoc DL(N);
49757 MVT SrcVT = N0.getSimpleValueType();
49758 SDValue Pack =
49759 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
49760 DAG.getBitcast(SrcVT, Not1));
49761 return DAG.getNOT(DL, Pack, VT);
49762 }
49763 }
49764
49765 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
49766 // truncate to create a larger truncate.
49767 if (Subtarget.hasAVX512() &&
49768 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
49769 N0.getOperand(0).getValueType() == MVT::v8i32) {
49770 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
49771 (!IsSigned &&
49772 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
49773 if (Subtarget.hasVLX())
49774 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
49775
49776 // Widen input to v16i32 so we can truncate that.
49777 SDLoc dl(N);
49778 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
49779 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
49780 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
49781 }
49782 }
49783
49784 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
49785 if (VT.is128BitVector()) {
49786 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49787 SDValue Src0, Src1;
49788 if (N0.getOpcode() == ExtOpc &&
49789 N0.getOperand(0).getValueType().is64BitVector() &&
49790 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49791 Src0 = N0.getOperand(0);
49792 }
49793 if (N1.getOpcode() == ExtOpc &&
49794 N1.getOperand(0).getValueType().is64BitVector() &&
49795 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49796 Src1 = N1.getOperand(0);
49797 }
49798 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
49799 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
49800 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
49801 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
49802 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
49803 }
49804
49805 // Try again with pack(*_extend_vector_inreg, undef).
49806 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
49807 : ISD::ZERO_EXTEND_VECTOR_INREG;
49808 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
49809 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
49810 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
49811 DAG);
49812 }
49813
49814 // Attempt to combine as shuffle.
49815 SDValue Op(N, 0);
49816 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49817 return Res;
49818
49819 return SDValue();
49820}
49821
49822static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
49823 TargetLowering::DAGCombinerInfo &DCI,
49824 const X86Subtarget &Subtarget) {
49825 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
49826 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
49827 "Unexpected horizontal add/sub opcode");
49828
49829 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
49830 MVT VT = N->getSimpleValueType(0);
49831 SDValue LHS = N->getOperand(0);
49832 SDValue RHS = N->getOperand(1);
49833
49834 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
49835 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
49836 LHS.getOpcode() == RHS.getOpcode() &&
49837 LHS.getValueType() == RHS.getValueType() &&
49838 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
49839 SDValue LHS0 = LHS.getOperand(0);
49840 SDValue LHS1 = LHS.getOperand(1);
49841 SDValue RHS0 = RHS.getOperand(0);
49842 SDValue RHS1 = RHS.getOperand(1);
49843 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
49844 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
49845 SDLoc DL(N);
49846 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
49847 LHS0.isUndef() ? LHS1 : LHS0,
49848 RHS0.isUndef() ? RHS1 : RHS0);
49849 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
49850 Res = DAG.getBitcast(ShufVT, Res);
49851 SDValue NewLHS =
49852 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49853 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
49854 SDValue NewRHS =
49855 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49856 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
49857 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
49858 DAG.getBitcast(VT, NewRHS));
49859 }
49860 }
49861 }
49862
49863 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49864 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49865 return V;
49866
49867 return SDValue();
49868}
49869
49870static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
49871 TargetLowering::DAGCombinerInfo &DCI,
49872 const X86Subtarget &Subtarget) {
49873 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49874 X86ISD::VSRL == N->getOpcode()) &&
49875 "Unexpected shift opcode");
49876 EVT VT = N->getValueType(0);
49877 SDValue N0 = N->getOperand(0);
49878 SDValue N1 = N->getOperand(1);
49879
49880 // Shift zero -> zero.
49881 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49882 return DAG.getConstant(0, SDLoc(N), VT);
49883
49884 // Detect constant shift amounts.
49885 APInt UndefElts;
49886 SmallVector<APInt, 32> EltBits;
49887 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
49888 /*AllowWholeUndefs*/ true,
49889 /*AllowPartialUndefs*/ false)) {
49890 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49891 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
49892 EltBits[0].getZExtValue(), DAG);
49893 }
49894
49895 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49896 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
49897 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
49898 return SDValue(N, 0);
49899
49900 return SDValue();
49901}
49902
49903static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
49904 TargetLowering::DAGCombinerInfo &DCI,
49905 const X86Subtarget &Subtarget) {
49906 unsigned Opcode = N->getOpcode();
49907 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
49908 X86ISD::VSRLI == Opcode) &&
49909 "Unexpected shift opcode");
49910 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49911 EVT VT = N->getValueType(0);
49912 SDValue N0 = N->getOperand(0);
49913 SDValue N1 = N->getOperand(1);
49914 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49915 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49916 "Unexpected value type");
49917 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49918
49919 // (shift undef, X) -> 0
49920 if (N0.isUndef())
49921 return DAG.getConstant(0, SDLoc(N), VT);
49922
49923 // Out of range logical bit shifts are guaranteed to be zero.
49924 // Out of range arithmetic bit shifts splat the sign bit.
49925 unsigned ShiftVal = N->getConstantOperandVal(1);
49926 if (ShiftVal >= NumBitsPerElt) {
49927 if (LogicalShift)
49928 return DAG.getConstant(0, SDLoc(N), VT);
49929 ShiftVal = NumBitsPerElt - 1;
49930 }
49931
49932 // (shift X, 0) -> X
49933 if (!ShiftVal)
49934 return N0;
49935
49936 // (shift 0, C) -> 0
49937 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49938 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49939 // result are all zeros, not undef.
49940 return DAG.getConstant(0, SDLoc(N), VT);
49941
49942 // (VSRAI -1, C) -> -1
49943 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49944 // N0 is all ones or undef. We guarantee that the bits shifted into the
49945 // result are all ones, not undef.
49946 return DAG.getAllOnesConstant(SDLoc(N), VT);
49947
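 // Merge two constant shift amounts into one shift, folding out-of-range
 // logical shifts to zero and clamping arithmetic shifts to NumBitsPerElt - 1.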
49948 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49949 unsigned NewShiftVal = Amt0 + Amt1;
49950 if (NewShiftVal >= NumBitsPerElt) {
49951 // Out of range logical bit shifts are guaranteed to be zero.
49952 // Out of range arithmetic bit shifts splat the sign bit.
49953 if (LogicalShift)
49954 return DAG.getConstant(0, SDLoc(N), VT);
49955 NewShiftVal = NumBitsPerElt - 1;
49956 }
49957 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49958 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49959 };
49960
49961 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49962 if (Opcode == N0.getOpcode())
49963 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49964
49965 // (shl (add X, X), C) -> (shl X, (C + 1))
49966 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49967 N0.getOperand(0) == N0.getOperand(1))
49968 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49969
49970 // We can decode 'whole byte' logical bit shifts as shuffles.
49971 if (LogicalShift && (ShiftVal % 8) == 0) {
49972 SDValue Op(N, 0);
49973 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49974 return Res;
49975 }
49976
49977 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
49978 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
49979 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
49980 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
49981 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
49982 N0.getOpcode() == X86ISD::PSHUFD &&
49983 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
49984 N0->hasOneUse()) {
49985 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
49986 if (BC.getOpcode() == X86ISD::VSHLI &&
49987 BC.getScalarValueSizeInBits() == 64 &&
49988 BC.getConstantOperandVal(1) == 63) {
49989 SDLoc DL(N);
49990 SDValue Src = BC.getOperand(0);
49991 Src = DAG.getBitcast(VT, Src);
49992 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
49993 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
49994 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
49995 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
49996 return Src;
49997 }
49998 }
49999
50000 auto TryConstantFold = [&](SDValue V) {
50001 APInt UndefElts;
50002 SmallVector<APInt, 32> EltBits;
50003 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50004 /*AllowWholeUndefs*/ true,
50005 /*AllowPartialUndefs*/ true))
50006 return SDValue();
50007 assert(EltBits.size() == VT.getVectorNumElements() &&
50008 "Unexpected shift value type");
50009 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50010 // created an undef input due to no input bits being demanded, but user
50011 // still expects 0 in other bits.
50012 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50013 APInt &Elt = EltBits[i];
50014 if (UndefElts[i])
50015 Elt = 0;
50016 else if (X86ISD::VSHLI == Opcode)
50017 Elt <<= ShiftVal;
50018 else if (X86ISD::VSRAI == Opcode)
50019 Elt.ashrInPlace(ShiftVal);
50020 else
50021 Elt.lshrInPlace(ShiftVal);
50022 }
50023 // Reset undef elements since they were zeroed above.
50024 UndefElts = 0;
50025 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50026 };
50027
50028 // Constant Folding.
50029 if (N->isOnlyUserOf(N0.getNode())) {
50030 if (SDValue C = TryConstantFold(N0))
50031 return C;
50032
50033 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50034 // Don't break NOT patterns.
50035 SDValue BC = peekThroughOneUseBitcasts(N0);
50036 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50037 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50038 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50039 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50040 SDLoc DL(N);
50041 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50042 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50043 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50044 }
50045 }
50046 }
50047
50048 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50049 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50050 DCI))
50051 return SDValue(N, 0);
50052
50053 return SDValue();
50054}
50055
50056static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50057 TargetLowering::DAGCombinerInfo &DCI,
50058 const X86Subtarget &Subtarget) {
50059 EVT VT = N->getValueType(0);
50060 unsigned Opcode = N->getOpcode();
50061 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50062 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50063 Opcode == ISD::INSERT_VECTOR_ELT) &&
50064 "Unexpected vector insertion");
50065
50066 SDValue Vec = N->getOperand(0);
50067 SDValue Scl = N->getOperand(1);
50068 SDValue Idx = N->getOperand(2);
50069
50070 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50071 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50072 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50073
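 // For PINSRB/PINSRW, let generic demanded-bits simplification clean up the
 // node before attempting any shuffle combining below.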
50074 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50075 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50076 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50077 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50078 APInt::getAllOnes(NumBitsPerElt), DCI))
50079 return SDValue(N, 0);
50080 }
50081
50082 // Attempt to combine insertion patterns to a shuffle.
50083 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50084 SDValue Op(N, 0);
50085 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50086 return Res;
50087 }
50088
50089 return SDValue();
50090}
50091
50092/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50093/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50094/// OR -> CMPNEQSS.
50095static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50096 TargetLowering::DAGCombinerInfo &DCI,
50097 const X86Subtarget &Subtarget) {
50098 unsigned opcode;
50099
50100 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50101 // we're requiring SSE2 for both.
50102 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50103 SDValue N0 = N->getOperand(0);
50104 SDValue N1 = N->getOperand(1);
50105 SDValue CMP0 = N0.getOperand(1);
50106 SDValue CMP1 = N1.getOperand(1);
50107 SDLoc DL(N);
50108
50109 // The SETCCs should both refer to the same CMP.
50110 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50111 return SDValue();
50112
50113 SDValue CMP00 = CMP0->getOperand(0);
50114 SDValue CMP01 = CMP0->getOperand(1);
50115 EVT VT = CMP00.getValueType();
50116
50117 if (VT == MVT::f32 || VT == MVT::f64 ||
50118 (VT == MVT::f16 && Subtarget.hasFP16())) {
50119 bool ExpectingFlags = false;
50120 // Check for any users that want flags:
50121 for (const SDNode *U : N->users()) {
50122 if (ExpectingFlags)
50123 break;
50124
50125 switch (U->getOpcode()) {
50126 default:
50127 case ISD::BR_CC:
50128 case ISD::BRCOND:
50129 case ISD::SELECT:
50130 ExpectingFlags = true;
50131 break;
50132 case ISD::CopyToReg:
50133 case ISD::SIGN_EXTEND:
50134 case ISD::ZERO_EXTEND:
50135 case ISD::ANY_EXTEND:
50136 break;
50137 }
50138 }
50139
50140 if (!ExpectingFlags) {
50141 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50142 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50143
50144 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50145 X86::CondCode tmp = cc0;
50146 cc0 = cc1;
50147 cc1 = tmp;
50148 }
50149
50150 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50151 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50152 // FIXME: need symbolic constants for these magic numbers.
50153 // See X86ATTInstPrinter.cpp:printSSECC().
50154 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50155 if (Subtarget.hasAVX512()) {
50156 SDValue FSetCC =
50157 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50158 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50159 // Need to fill with zeros to ensure the bitcast will produce zeroes
50160 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50161 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50162 DAG.getConstant(0, DL, MVT::v16i1),
50163 FSetCC, DAG.getVectorIdxConstant(0, DL));
50164 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50165 N->getSimpleValueType(0));
50166 }
50167 SDValue OnesOrZeroesF =
50168 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50169 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50170
50171 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50172 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50173
50174 if (is64BitFP && !Subtarget.is64Bit()) {
50175 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50176 // 64-bit integer, since that's not a legal type. Since
50177 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50178 // bits, but can do this little dance to extract the lowest 32 bits
50179 // and work with those going forward.
50180 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50181 MVT::v2f64, OnesOrZeroesF);
50182 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50183 OnesOrZeroesF =
50184 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50185 DAG.getVectorIdxConstant(0, DL));
50186 IntVT = MVT::i32;
50187 }
50188
50189 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50190 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50191 DAG.getConstant(1, DL, IntVT));
50192 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50193 ANDed);
50194 return OneBitOfTruth;
50195 }
50196 }
50197 }
50198 }
50199 return SDValue();
50200}
50201
50202/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50203static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
50204 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50205
50206 MVT VT = N->getSimpleValueType(0);
50207 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50208 return SDValue();
50209
50210 SDValue X, Y;
50211 SDValue N0 = N->getOperand(0);
50212 SDValue N1 = N->getOperand(1);
50213
50214 if (SDValue Not = IsNOT(N0, DAG)) {
50215 X = Not;
50216 Y = N1;
50217 } else if (SDValue Not = IsNOT(N1, DAG)) {
50218 X = Not;
50219 Y = N0;
50220 } else
50221 return SDValue();
50222
50223 X = DAG.getBitcast(VT, X);
50224 Y = DAG.getBitcast(VT, Y);
50225 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
50226}
50227
50228/// Try to fold:
50229/// and (vector_shuffle<Z,...,Z>
50230/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50231/// ->
50232/// andnp (vector_shuffle<Z,...,Z>
50233/// (insert_vector_elt undef, X, Z), undef), Y
50234static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50235 const X86Subtarget &Subtarget) {
50236 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50237
50238 EVT VT = N->getValueType(0);
50239 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite the
50240 // original value and require extra moves.
50241 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50242 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50243 return SDValue();
50244
50245 auto GetNot = [&DAG](SDValue V) {
50246 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50247 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50248 // end-users are ISD::AND including cases
50249 // (and(extract_vector_element(SVN), Y)).
50250 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50251 !SVN->getOperand(1).isUndef()) {
50252 return SDValue();
50253 }
50254 SDValue IVEN = SVN->getOperand(0);
50255 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50256 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50257 return SDValue();
50258 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50259 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50260 return SDValue();
50261 SDValue Src = IVEN.getOperand(1);
50262 if (SDValue Not = IsNOT(Src, DAG)) {
50263 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50264 SDValue NotIVEN =
50265 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50266 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50267 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50268 SVN->getOperand(1), SVN->getMask());
50269 }
50270 return SDValue();
50271 };
50272
50273 SDValue X, Y;
50274 SDValue N0 = N->getOperand(0);
50275 SDValue N1 = N->getOperand(1);
50276 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50277
50278 if (SDValue Not = GetNot(N0)) {
50279 X = Not;
50280 Y = N1;
50281 } else if (SDValue Not = GetNot(N1)) {
50282 X = Not;
50283 Y = N0;
50284 } else
50285 return SDValue();
50286
50287 X = DAG.getBitcast(VT, X);
50288 Y = DAG.getBitcast(VT, Y);
50289 SDLoc DL(N);
50290
50291 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50292 // AVX2.
50293 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50294 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50295 SDValue LoX, HiX;
50296 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50297 SDValue LoY, HiY;
50298 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50299 EVT SplitVT = LoX.getValueType();
50300 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50301 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50302 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50303 }
50304
50305 if (TLI.isTypeLegal(VT))
50306 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50307
50308 return SDValue();
50309}
50310
50311// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50312// logical operations, like in the example below.
50313// or (and (truncate x, truncate y)),
50314// (xor (truncate z, build_vector (constants)))
50315// Given a target type \p VT, we generate
50316// or (and x, y), (xor z, zext(build_vector (constants)))
50317// given x, y and z are of type \p VT. We can do so, if operands are either
50318// truncates from VT types, the second operand is a vector of constants or can
50319// be recursively promoted.
50320static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
50321 SelectionDAG &DAG, unsigned Depth) {
50322 // Limit recursion to avoid excessive compile times.
50323 if (Depth >= SelectionDAG::MaxRecursionDepth)
50324 return SDValue();
50325
50326 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
50327 return SDValue();
50328
50329 SDValue N0 = N.getOperand(0);
50330 SDValue N1 = N.getOperand(1);
50331
50332 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50333 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
50334 return SDValue();
50335
50336 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
50337 N0 = NN0;
50338 else {
50339 // The left side has to be a trunc.
50340 if (N0.getOpcode() != ISD::TRUNCATE)
50341 return SDValue();
50342
50343 // The type of the truncated inputs.
50344 if (N0.getOperand(0).getValueType() != VT)
50345 return SDValue();
50346
50347 N0 = N0.getOperand(0);
50348 }
50349
50350 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
50351 N1 = NN1;
50352 else {
50353 // The right side has to be a 'trunc' or a (foldable) constant.
50354 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
50355 N1.getOperand(0).getValueType() == VT;
50356 if (RHSTrunc)
50357 N1 = N1.getOperand(0);
50358 else if (SDValue Cst =
50359 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
50360 N1 = Cst;
50361 else
50362 return SDValue();
50363 }
50364
50365 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
50366}
50367
50368// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
50369// register. In most cases we actually compare or select YMM-sized registers
50370// and mixing the two types creates horrible code. This method optimizes
50371// some of the transition sequences.
50372// Even with AVX-512 this is still useful for removing casts around logical
50373// operations on vXi1 mask types.
50374static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
50375 SelectionDAG &DAG,
50376 const X86Subtarget &Subtarget) {
50377 EVT VT = N.getValueType();
50378 assert(VT.isVector() && "Expected vector type");
50379 assert((N.getOpcode() == ISD::ANY_EXTEND ||
50380 N.getOpcode() == ISD::ZERO_EXTEND ||
50381 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
50382
50383 SDValue Narrow = N.getOperand(0);
50384 EVT NarrowVT = Narrow.getValueType();
50385
50386 // Generate the wide operation.
50387 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
50388 if (!Op)
50389 return SDValue();
50390 switch (N.getOpcode()) {
50391 default: llvm_unreachable("Unexpected opcode");
50392 case ISD::ANY_EXTEND:
50393 return Op;
50394 case ISD::ZERO_EXTEND:
50395 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
50396 case ISD::SIGN_EXTEND:
50397 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
50398 Op, DAG.getValueType(NarrowVT));
50399 }
50400}
50401
50402static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
50403 unsigned FPOpcode;
50404 switch (Opcode) {
50405 // clang-format off
50406 default: llvm_unreachable("Unexpected input node for FP logic conversion");
50407 case ISD::AND: FPOpcode = X86ISD::FAND; break;
50408 case ISD::OR: FPOpcode = X86ISD::FOR; break;
50409 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
50410 // clang-format on
50411 }
50412 return FPOpcode;
50413}
50414
50415/// If both input operands of a logic op are being cast from floating-point
50416/// types or FP compares, try to convert this into a floating-point logic node
50417/// to avoid unnecessary moves from SSE to integer registers.
50418static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
50419 SDValue N0, SDValue N1,
50420 SelectionDAG &DAG,
50421 TargetLowering::DAGCombinerInfo &DCI,
50422 const X86Subtarget &Subtarget) {
50423 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50424 "Unexpected bit opcode");
50425
50426 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
50427 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
50428 return SDValue();
50429
50430 SDValue N00 = N0.getOperand(0);
50431 SDValue N10 = N1.getOperand(0);
50432 EVT N00Type = N00.getValueType();
50433 EVT N10Type = N10.getValueType();
50434
50435 // Ensure that both types are the same and are legal scalar fp types.
50436 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
50437 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
50438 (Subtarget.hasFP16() && N00Type == MVT::f16)))
50439 return SDValue();
50440
50441 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
50442 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
50443 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
50444 return DAG.getBitcast(VT, FPLogic);
50445 }
50446
50447 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
50448 !N1.hasOneUse())
50449 return SDValue();
50450
50451 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
50452 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
50453
50454 // The vector ISA for FP predicates is incomplete before AVX, so converting
50455 // COMIS* to CMPS* may not be a win before AVX.
50456 if (!Subtarget.hasAVX() &&
50457 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
50458 return SDValue();
50459
50460 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
50461 // and vector logic:
50462 // logic (setcc N00, N01), (setcc N10, N11) -->
50463 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
50464 unsigned NumElts = 128 / N00Type.getSizeInBits();
50465 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
50466 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
50467 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
50468 SDValue N01 = N0.getOperand(1);
50469 SDValue N11 = N1.getOperand(1);
50470 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
50471 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
50472 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
50473 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
50474 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
50475 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
50476 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
50477 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
50478}
50479
50480// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
50481// to reduce XMM->GPR traffic.
50482static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
50483 SDValue N1, SelectionDAG &DAG) {
50484 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50485 "Unexpected bit opcode");
50486
50487 // Both operands must be single use MOVMSK.
50488 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
50489 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
50490 return SDValue();
50491
50492 SDValue Vec0 = N0.getOperand(0);
50493 SDValue Vec1 = N1.getOperand(0);
50494 EVT VecVT0 = Vec0.getValueType();
50495 EVT VecVT1 = Vec1.getValueType();
50496
50497 // Both MOVMSK operands must be from vectors of the same size and same element
50498 // size, but it's OK for an fp/int diff.
50499 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
50500 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
50501 return SDValue();
50502
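 // Integer AND/OR/XOR are not legal on FP vector types, so use the equivalent
 // X86ISD FP logic opcode when the (bitcast) source vectors are floating-point.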
50503 unsigned VecOpc =
50504 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
50505 SDValue Result =
50506 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
50507 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
50508}
50509
50510// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
50511// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
50512// handles in InstCombine.
50513static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
50514 SDValue N0, SDValue N1,
50515 SelectionDAG &DAG) {
50516 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50517 "Unexpected bit opcode");
50518
50519 // Both operands must be single use.
50520 if (!N0.hasOneUse() || !N1.hasOneUse())
50521 return SDValue();
50522
50523 // Search for matching shifts.
50524 SDValue BC0 = peekThroughOneUseBitcasts(N0);
50525 SDValue BC1 = peekThroughOneUseBitcasts(N1);
50526
50527 unsigned BCOpc = BC0.getOpcode();
50528 EVT BCVT = BC0.getValueType();
50529 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
50530 return SDValue();
50531
50532 switch (BCOpc) {
50533 case X86ISD::VSHLI:
50534 case X86ISD::VSRLI:
50535 case X86ISD::VSRAI: {
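 // Only fold when both sides shift by the same immediate so the shift can be
 // hoisted above the merged bit-op.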
50536 if (BC0.getOperand(1) != BC1.getOperand(1))
50537 return SDValue();
50538 SDValue BitOp =
50539 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
50540 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
50541 return DAG.getBitcast(VT, Shift);
50542 }
50543 }
50544
50545 return SDValue();
50546}
50547
50548// Attempt to fold:
50549// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
50550// TODO: Add PACKUS handling.
50551static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
50552 SDValue N0, SDValue N1, SelectionDAG &DAG) {
50553 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50554 "Unexpected bit opcode");
50555
50556 // Both operands must be single use.
50557 if (!N0.hasOneUse() || !N1.hasOneUse())
50558 return SDValue();
50559
50560 // Search for matching packs.
50561 N0 = peekThroughOneUseBitcasts(N0);
50562 N1 = peekThroughOneUseBitcasts(N1);
50563
50564 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
50565 return SDValue();
50566
50567 MVT DstVT = N0.getSimpleValueType();
50568 if (DstVT != N1.getSimpleValueType())
50569 return SDValue();
50570
50571 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
50572 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
50573
50574 // Limit to allsignbits packing.
50575 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
50576 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
50577 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
50578 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
50579 return SDValue();
50580
50581 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
50582 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
50583 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
50584}
50585
50586/// If this is a zero/all-bits result that is bitwise-anded with a low bits
50587/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
50588/// with a shift-right to eliminate loading the vector constant mask value.
50589static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
50590 const X86Subtarget &Subtarget) {
50591 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
50592 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
50593 EVT VT = Op0.getValueType();
50594 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
50595 return SDValue();
50596
50597 // Try to convert an "is positive" signbit masking operation into arithmetic
50598 // shift and "andn". This saves a materialization of a -1 vector constant.
50599 // The "is negative" variant should be handled more generally because it only
50600 // requires "and" rather than "andn":
50601 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
50602 //
50603 // This is limited to the original type to avoid producing even more bitcasts.
50604 // If the bitcasts can't be eliminated, then it is unlikely that this fold
50605 // will be profitable.
50606 if (N->getValueType(0) == VT &&
50607 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
50608 SDValue X, Y;
50609 if (Op1.getOpcode() == X86ISD::PCMPGT &&
50610 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
50611 X = Op1.getOperand(0);
50612 Y = Op0;
50613 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
50614 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
50615 X = Op0.getOperand(0);
50616 Y = Op1;
50617 }
50618 if (X && Y) {
50619 SDLoc DL(N);
50620 SDValue Sra =
50621 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
50622 VT.getScalarSizeInBits() - 1, DAG);
50623 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
50624 }
50625 }
50626
50627 APInt SplatVal;
50628 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
50629 return SDValue();
50630
50631 // Don't prevent creation of ANDN.
50632 if (isBitwiseNot(Op0))
50633 return SDValue();
50634
50635 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
50636 return SDValue();
50637
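 // Op0 must be all-ones or all-zeros per element (all sign bits); masking its
 // low ShiftVal bits is then the same as a logical right shift by
 // (EltBitWidth - ShiftVal), which avoids loading the mask constant.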
50638 unsigned EltBitWidth = VT.getScalarSizeInBits();
50639 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
50640 return SDValue();
50641
50642 SDLoc DL(N);
50643 unsigned ShiftVal = SplatVal.countr_one();
50644 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
50645 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
50646 return DAG.getBitcast(N->getValueType(0), Shift);
50647}
50648
50649// Get the index node from the lowered DAG of a GEP IR instruction with one
50650// indexing dimension.
50651static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
50652 if (Ld->isIndexed())
50653 return SDValue();
50654
50655 SDValue Base = Ld->getBasePtr();
50656 if (Base.getOpcode() != ISD::ADD)
50657 return SDValue();
50658
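 // A single-index GEP is lowered to add(base, shl(index, log2(elt size)));
 // peel the shl off the add to recover the original index operand.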
50659 SDValue ShiftedIndex = Base.getOperand(0);
50660 if (ShiftedIndex.getOpcode() != ISD::SHL)
50661 return SDValue();
50662
50663 return ShiftedIndex.getOperand(0);
50664}
50665
50666static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
50667 return Subtarget.hasBMI2() &&
50668 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
50669}
50670
50671/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
50672/// This undoes the inverse fold performed in InstCombine
50673static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, SelectionDAG &DAG) {
50674
50675 using namespace llvm::SDPatternMatch;
50676 MVT VT = N->getSimpleValueType(0);
50677 SDLoc DL(N);
50678 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50679 if (!TLI.hasAndNot(SDValue(N, 0)))
50680 return SDValue();
50681
50682 SDValue X, Y, Z;
50683 if (sd_match(N, m_And(m_Value(X),
50684 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
50685 // Don't fold if Y or Z are constants to prevent infinite loops.
50686 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
50687 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
50688 return DAG.getNode(
50689 ISD::AND, DL, VT, X,
50690 DAG.getNOT(
50691 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
50692 }
50693
50694 return SDValue();
50695}
50696
50697// This function recognizes cases where the X86 bzhi instruction can replace an
50698// 'and-load' sequence.
50699// In case of loading integer value from an array of constants which is defined
50700// as follows:
50701//
50702// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
50703//
50704// then applying a bitwise and on the result with another input.
50705// It's equivalent to performing bzhi (zero high bits) on the input, with the
50706// same index of the load.
50707static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
50708 const X86Subtarget &Subtarget) {
50709 MVT VT = Node->getSimpleValueType(0);
50710 SDLoc dl(Node);
50711
50712 // Check if subtarget has BZHI instruction for the node's type
50713 if (!hasBZHI(Subtarget, VT))
50714 return SDValue();
50715
50716 // Try matching the pattern for both operands.
50717 for (unsigned i = 0; i < 2; i++) {
50718 // continue if the operand is not a load instruction
50719 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
50720 if (!Ld)
50721 continue;
50722 const Value *MemOp = Ld->getMemOperand()->getValue();
50723 if (!MemOp)
50724 continue;
50725 // Get the Node which indexes into the array.
50726 SDValue Index = getIndexFromUnindexedLoad(Ld);
50727 if (!Index)
50728 continue;
50729
50730 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
50731 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
50732 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
50733 Constant *Init = GV->getInitializer();
50734 Type *Ty = Init->getType();
50735 if (!isa<ConstantDataArray>(Init) ||
50736 !Ty->getArrayElementType()->isIntegerTy() ||
50737 Ty->getArrayElementType()->getScalarSizeInBits() !=
50738 VT.getSizeInBits() ||
50739 Ty->getArrayNumElements() >
50740 Ty->getArrayElementType()->getScalarSizeInBits())
50741 continue;
50742
50743 // Check if the array's constant elements are suitable to our case.
50744 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
50745 bool ConstantsMatch = true;
50746 for (uint64_t j = 0; j < ArrayElementCount; j++) {
50747 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
50748 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
50749 ConstantsMatch = false;
50750 break;
50751 }
50752 }
50753 if (!ConstantsMatch)
50754 continue;
50755
50756 // Do the transformation (For 32-bit type):
50757 // -> (and (load arr[idx]), inp)
50758 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
50759 // that will be replaced with one bzhi instruction.
50760 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
50761 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
50762
50763 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
50764 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
50765 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
50766
50767 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
50768 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
50769 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
50770 }
50771 }
50772 }
50773 }
50774 return SDValue();
50775}
50776
50777// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
50778// Where C is a mask containing the same number of bits as the setcc and
50779// where the setcc will freely 0 upper bits of k-register. We can replace the
50780// undef in the concat with 0s and remove the AND. This mainly helps with
50781// v2i1/v4i1 setcc being casted to scalar.
50782static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
50783 const X86Subtarget &Subtarget) {
50784 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
50785
50786 EVT VT = N->getValueType(0);
50787
50788 // Make sure this is an AND with constant. We will check the value of the
50789 // constant later.
50790 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
50791 if (!C1)
50792 return SDValue();
50793
50794 // This is implied by the ConstantSDNode.
50795 assert(!VT.isVector() && "Expected scalar VT!");
50796
50797 SDValue Src = N->getOperand(0);
50798 if (!Src.hasOneUse())
50799 return SDValue();
50800
50801 // (Optionally) peek through any_extend().
50802 if (Src.getOpcode() == ISD::ANY_EXTEND) {
50803 if (!Src.getOperand(0).hasOneUse())
50804 return SDValue();
50805 Src = Src.getOperand(0);
50806 }
50807
50808 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
50809 return SDValue();
50810
50811 Src = Src.getOperand(0);
50812 EVT SrcVT = Src.getValueType();
50813
50814 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50815 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
50816 !TLI.isTypeLegal(SrcVT))
50817 return SDValue();
50818
50819 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
50820 return SDValue();
50821
50822 // We only care about the first subvector of the concat, we expect the
50823 // other subvectors to be ignored due to the AND if we make the change.
50824 SDValue SubVec = Src.getOperand(0);
50825 EVT SubVecVT = SubVec.getValueType();
50826
50827 // The RHS of the AND should be a mask with as many bits as SubVec.
50828 if (!TLI.isTypeLegal(SubVecVT) ||
50829 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
50830 return SDValue();
50831
50832 // First subvector should be a setcc with a legal result type or a
50833 // AND containing at least one setcc with a legal result type.
50834 auto IsLegalSetCC = [&](SDValue V) {
50835 if (V.getOpcode() != ISD::SETCC)
50836 return false;
50837 EVT SetccVT = V.getOperand(0).getValueType();
50838 if (!TLI.isTypeLegal(SetccVT) ||
50839 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
50840 return false;
50841 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
50842 return false;
50843 return true;
50844 };
50845 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
50846 (IsLegalSetCC(SubVec.getOperand(0)) ||
50847 IsLegalSetCC(SubVec.getOperand(1))))))
50848 return SDValue();
50849
50850 // We passed all the checks. Rebuild the concat_vectors with zeroes
50851 // and cast it back to VT.
50852 SDLoc dl(N);
50853 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
50854 DAG.getConstant(0, dl, SubVecVT));
50855 Ops[0] = SubVec;
50856 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
50857 Ops);
50858 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
50859 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
50860}
50861
50862static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
50863 SDValue OpMustEq, SDValue Op, unsigned Depth) {
50864 // We don't want to go crazy with the recursion here. This isn't a super
50865 // important optimization.
50866 static constexpr unsigned kMaxDepth = 2;
50867
50868 // Only do this re-ordering if op has one use.
50869 if (!Op.hasOneUse())
50870 return SDValue();
50871
50872 SDLoc DL(Op);
50873 // If we hit another associative op, recurse further.
50874 if (Op.getOpcode() == Opc) {
50875 // Done recursing.
50876 if (Depth++ >= kMaxDepth)
50877 return SDValue();
50878
50879 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50880 if (SDValue R =
50881 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
50882 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
50883 Op.getOperand(1 - OpIdx));
50884
50885 } else if (Op.getOpcode() == ISD::SUB) {
50886 if (Opc == ISD::AND) {
50887 // BLSI: (and x, (sub 0, x))
50888 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
50889 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50890 }
50891 // Opc must be ISD::AND or ISD::XOR
50892 // BLSR: (and x, (sub x, 1))
50893 // BLSMSK: (xor x, (sub x, 1))
50894 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50895 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50896
50897 } else if (Op.getOpcode() == ISD::ADD) {
50898 // Opc must be ISD::AND or ISD::XOR
50899 // BLSR: (and x, (add x, -1))
50900 // BLSMSK: (xor x, (add x, -1))
50901 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
50902 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
50903 }
50904 return SDValue();
50905}
50906
50907static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
50908 const X86Subtarget &Subtarget) {
50909 EVT VT = N->getValueType(0);
50910 // Make sure this node is a candidate for BMI instructions.
50911 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
50912 (VT != MVT::i32 && VT != MVT::i64))
50913 return SDValue();
50914
50915 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
50916
50917 // Try and match LHS and RHS.
50918 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
50919 if (SDValue OpMatch =
50920 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
50921 N->getOperand(1 - OpIdx), 0))
50922 return OpMatch;
50923 return SDValue();
50924}
50925
50926static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
50927 SelectionDAG &DAG,
50928 TargetLowering::DAGCombinerInfo &DCI,
50929 const X86Subtarget &ST) {
50930 // cmp(setcc(cc, X), 0)
50931 // brcond ne
50932 // ->
50933 // X
50934 // brcond cc
50935
50936 // sub(setcc(cc, X), 1)
50937 // brcond ne
50938 // ->
50939 // X
50940 // brcond ~cc
50941 //
50942 // if only flag has users
50943
50944 SDValue SetCC = N->getOperand(0);
50945
50946 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
50947 return SDValue();
50948
50949 // Check the only user of flag is `brcond ne`.
50950 SDNode *BrCond = *Flag->user_begin();
50951 if (BrCond->getOpcode() != X86ISD::BRCOND)
50952 return SDValue();
50953 unsigned CondNo = 2;
50954 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
50955 X86::COND_NE)
50956 return SDValue();
50957
50958 SDValue X = SetCC.getOperand(1);
50959 // sub has two results while X only has one. DAG combine assumes the value
50960 // type matches.
50961 if (N->getOpcode() == X86ISD::SUB)
50962 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
50963
50964 SDValue CCN = SetCC.getOperand(0);
50965 X86::CondCode CC =
50966 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
50967 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
50968 // Update CC for the consumer of the flag.
50969 // The old CC is `ne`. Hence, when comparing the result with 0, we are
50970 // checking if the second condition evaluates to true. When comparing the
50971 // result with 1, we are checking if the second condition evaluates to false.
50972 SmallVector<SDValue> Ops(BrCond->op_values());
50973 if (isNullConstant(N->getOperand(1)))
50974 Ops[CondNo] = CCN;
50975 else if (isOneConstant(N->getOperand(1)))
50976 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
50977 else
50978 llvm_unreachable("expect constant 0 or 1");
50979
50980 SDValue NewBrCond =
50981 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
50982 // Avoid self-assign error b/c CC1 can be `e/ne`.
50983 if (BrCond != NewBrCond.getNode())
50984 DCI.CombineTo(BrCond, NewBrCond);
50985 return X;
50986}
50987
50988static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
50989 TargetLowering::DAGCombinerInfo &DCI,
50990 const X86Subtarget &ST) {
50991 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
50992 // ->
50993 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
50994
50995 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
50996 // ->
50997 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
50998 //
50999 // where cflags is determined by cc1.
51000
51001 if (!ST.hasCCMP())
51002 return SDValue();
51003
51004 SDValue SetCC0 = N->getOperand(0);
51005 SDValue SetCC1 = N->getOperand(1);
51006 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51007 SetCC1.getOpcode() != X86ISD::SETCC)
51008 return SDValue();
51009
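 // Map the compare feeding each SETCC onto its conditional form: a SUB becomes
 // CCMP and a CMP against zero becomes CTEST; anything else blocks the fold.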
51010 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51011 SDValue Op = V.getOperand(1);
51012 unsigned Opc = Op.getOpcode();
51013 if (Opc == X86ISD::SUB)
51014 return X86ISD::CCMP;
51015 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51016 return X86ISD::CTEST;
51017 return 0U;
51018 };
51019
51020 unsigned NewOpc = 0;
51021
51022 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51023 // appear on the right.
51024 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51025 std::swap(SetCC0, SetCC1);
51026 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51027 return SDValue();
51028 }
51029
51030 X86::CondCode CC0 =
51031 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51032 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51033 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51034 return SDValue();
51035
51036 bool IsOR = N->getOpcode() == ISD::OR;
51037
51038 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51039 // evaluates to true. So we need to use the inverse of CC0 as SrcCC when the
51040 // logic operator is OR. Similarly for CC1.
51041 SDValue SrcCC =
51042 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51043 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51044 : SetCC0.getOperand(0);
51045 SDValue CC1N = SetCC1.getOperand(0);
51046 X86::CondCode CC1 =
51047 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51048 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51049 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51050 SDLoc DL(N);
51051 SDValue CFlags = DAG.getTargetConstant(
51052 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51053 SDValue Sub = SetCC1.getOperand(1);
51054
51055 // Replace any uses of the old flag produced by SUB/CMP with the new one
51056 // produced by CCMP/CTEST.
51057 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51058 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51059 {Sub.getOperand(0), Sub.getOperand(1),
51060 CFlags, SrcCC, SetCC0.getOperand(1)})
51061 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51062 {Sub.getOperand(0), Sub.getOperand(0),
51063 CFlags, SrcCC, SetCC0.getOperand(1)});
51064
51065 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51066}
51067
51068 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51069 TargetLowering::DAGCombinerInfo &DCI,
51070 const X86Subtarget &Subtarget) {
51071 SDValue N0 = N->getOperand(0);
51072 SDValue N1 = N->getOperand(1);
51073 EVT VT = N->getValueType(0);
51074 SDLoc dl(N);
51075 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51076
51077 // If this is SSE1 only convert to FAND to avoid scalarization.
51078 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51079 return DAG.getBitcast(MVT::v4i32,
51080 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51081 DAG.getBitcast(MVT::v4f32, N0),
51082 DAG.getBitcast(MVT::v4f32, N1)));
51083 }
51084
51085 // Use a 32-bit and+zext if upper bits known zero.
51086 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51087 APInt HiMask = APInt::getHighBitsSet(64, 32);
51088 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51089 DAG.MaskedValueIsZero(N0, HiMask)) {
51090 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51091 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51092 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51093 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51094 }
51095 }
51096
51097 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51098 // TODO: Support multiple SrcOps.
51099 if (VT == MVT::i1) {
51100 SmallVector<SDValue, 2> SrcOps;
51101 SmallVector<APInt, 2> SrcPartials;
51102 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51103 SrcOps.size() == 1) {
51104 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51105 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51106 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51107 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51108 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51109 if (Mask) {
51110 assert(SrcPartials[0].getBitWidth() == NumElts &&
51111 "Unexpected partial reduction mask");
51112 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51113 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51114 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51115 }
51116 }
51117 }
51118
51119 // InstCombine converts:
51120 // `(-x << C0) & C1`
51121 // to
51122 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51123 // This saves an IR instruction but on x86 the neg/shift version is preferable
51124 // so undo the transform.
51125
51126 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51127 // TODO: We don't actually need a splat for this, we just need the checks to
51128 // hold for each element.
51129 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51130 /*AllowTruncation*/ false);
51131 ConstantSDNode *N01C =
51132 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51133 /*AllowTruncation*/ false);
51134 if (N1C && N01C) {
51135 const APInt &MulC = N01C->getAPIntValue();
51136 const APInt &AndC = N1C->getAPIntValue();
51137 APInt MulCLowBit = MulC & (-MulC);
51138 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51139 (MulCLowBit + MulC).isPowerOf2()) {
51140 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51141 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51142 assert(MulCLowBitLog != -1 &&
51143 "Isolated lowbit is somehow not a power of 2!");
51144 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51145 DAG.getConstant(MulCLowBitLog, dl, VT));
51146 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51147 }
51148 }
51149 }
51150
51151 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51152 return SetCC;
51153
51154 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51155 return V;
51156
51157 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51158 return R;
51159
51160 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51161 return R;
51162
51163 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51164 return R;
51165
51166 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51167 DAG, DCI, Subtarget))
51168 return FPLogic;
51169
51170 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51171 return R;
51172
51173 if (DCI.isBeforeLegalizeOps())
51174 return SDValue();
51175
51176 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51177 return R;
51178
51179 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
51180 return R;
51181
51182 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
51183 return ShiftRight;
51184
51185 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51186 return R;
51187
51188 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, DAG))
51189 return R;
51190
51191 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51192 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51193 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51194 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51195 unsigned Opc0 = N0.getOpcode();
51196 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51197 getTargetConstantFromNode(N0.getOperand(1)) &&
51198 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51199 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51200 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51201 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51202 }
51203 }
51204
51205 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51206 // avoids slow variable shift (moving shift amount to ECX etc.)
51207 if (isOneConstant(N1) && N0->hasOneUse()) {
51208 SDValue Src = N0;
51209 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51210 Src.getOpcode() == ISD::TRUNCATE) &&
51211 Src.getOperand(0)->hasOneUse())
51212 Src = Src.getOperand(0);
51213 bool ContainsNOT = false;
51214 X86::CondCode X86CC = X86::COND_B;
51215 // Peek through AND(NOT(SRL(X,Y)),1).
51216 if (isBitwiseNot(Src)) {
51217 Src = Src.getOperand(0);
51218 X86CC = X86::COND_AE;
51219 ContainsNOT = true;
51220 }
51221 if (Src.getOpcode() == ISD::SRL &&
51222 !isa<ConstantSDNode>(Src.getOperand(1))) {
51223 SDValue BitNo = Src.getOperand(1);
51224 Src = Src.getOperand(0);
51225 // Peek through AND(SRL(NOT(X),Y),1).
51226 if (isBitwiseNot(Src)) {
51227 Src = Src.getOperand(0);
51228 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51229 ContainsNOT = true;
51230 }
51231 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51232 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51233 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51234 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51235 }
51236 }
51237
51238 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51239 // Attempt to recursively combine a bitmask AND with shuffles.
51240 SDValue Op(N, 0);
51241 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51242 return Res;
51243
51244 // If either operand is a constant mask, then only the elements that aren't
51245 // zero are actually demanded by the other operand.
51246 auto GetDemandedMasks = [&](SDValue Op) {
51247 APInt UndefElts;
51248 SmallVector<APInt> EltBits;
51249 int NumElts = VT.getVectorNumElements();
51250 int EltSizeInBits = VT.getScalarSizeInBits();
51251 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51252 APInt DemandedElts = APInt::getAllOnes(NumElts);
51253 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51254 EltBits)) {
51255 DemandedBits.clearAllBits();
51256 DemandedElts.clearAllBits();
51257 for (int I = 0; I != NumElts; ++I) {
51258 if (UndefElts[I]) {
51259 // We can't assume an undef src element gives an undef dst - the
51260 // other src might be zero.
51261 DemandedBits.setAllBits();
51262 DemandedElts.setBit(I);
51263 } else if (!EltBits[I].isZero()) {
51264 DemandedBits |= EltBits[I];
51265 DemandedElts.setBit(I);
51266 }
51267 }
51268 }
51269 return std::make_pair(DemandedBits, DemandedElts);
51270 };
51271 APInt Bits0, Elts0;
51272 APInt Bits1, Elts1;
51273 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51274 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51275
51276 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51277 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51278 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51279 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
51280 if (N->getOpcode() != ISD::DELETED_NODE)
51281 DCI.AddToWorklist(N);
51282 return SDValue(N, 0);
51283 }
51284
51285 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
51286 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
51287 if (NewN0 || NewN1)
51288 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
51289 NewN1 ? NewN1 : N1);
51290 }
51291
51292 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
51293 if ((VT.getScalarSizeInBits() % 8) == 0 &&
51294 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
51295 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
51296 SDValue BitMask = N1;
51297 SDValue SrcVec = N0.getOperand(0);
51298 EVT SrcVecVT = SrcVec.getValueType();
51299
51300 // Check that the constant bitmask masks whole bytes.
51301 APInt UndefElts;
51302 SmallVector<APInt, 64> EltBits;
51303 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
51304 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
51305 llvm::all_of(EltBits, [](const APInt &M) {
51306 return M.isZero() || M.isAllOnes();
51307 })) {
51308 unsigned NumElts = SrcVecVT.getVectorNumElements();
51309 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
51310 unsigned Idx = N0.getConstantOperandVal(1);
51311
51312 // Create a root shuffle mask from the byte mask and the extracted index.
51313 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
51314 for (unsigned i = 0; i != Scale; ++i) {
51315 if (UndefElts[i])
51316 continue;
51317 int VecIdx = Scale * Idx + i;
51318 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
51319 }
51320
51321 if (SDValue Shuffle = combineX86ShufflesRecursively(
51322 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
51323 X86::MaxShuffleCombineDepth,
51324 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
51325 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
51326 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
51327 N0.getOperand(1));
51328 }
51329 }
51330
51331 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
51332 return R;
51333
51334 return SDValue();
51335}
51336
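As an illustrative aside (not part of this file): the undo of the InstCombine transform noted inside combineAnd relies on `(-x << C0) & C1` equaling `(x * (PowerOf2Ceil(C1) - (1 << C0))) & C1`. A minimal standalone sketch, assuming the example constants C0 = 2 and C1 = 0xF0 chosen only for the demonstration:

#include <cassert>
#include <cstdint>

// Check the identity on 8-bit values: the multiplier is
// PowerOf2Ceil(C1) - (1 << C0) = 0x100 - 4 = 252, and all arithmetic
// below C1's power-of-2 ceiling agrees modulo 2^8.
int main() {
  const uint8_t C1 = 0xF0;
  const unsigned C0 = 2;
  const uint8_t Mul = (uint8_t)(0x100u - (1u << C0));
  for (unsigned x = 0; x != 256; ++x) {
    uint8_t NegShift = (uint8_t)((0u - x) << C0) & C1;
    uint8_t MulForm = (uint8_t)(x * Mul) & C1;
    assert(NegShift == MulForm);
  }
  return 0;
}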
51337// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
51338 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
51339 SelectionDAG &DAG,
51340 const X86Subtarget &Subtarget) {
51341 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51342
51343 MVT VT = N->getSimpleValueType(0);
51344 unsigned EltSizeInBits = VT.getScalarSizeInBits();
51345 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
51346 return SDValue();
51347
51348 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
51349 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
51350 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
51351 return SDValue();
51352
51353 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
51354 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
51355 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
51356 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
51357 return SDValue();
51358
51359 // Attempt to extract constant byte masks.
51360 APInt UndefElts0, UndefElts1;
51361 SmallVector<APInt, 32> EltBits0, EltBits1;
51362 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
51363 /*AllowWholeUndefs*/ false,
51364 /*AllowPartialUndefs*/ false))
51365 return SDValue();
51366 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
51367 /*AllowWholeUndefs*/ false,
51368 /*AllowPartialUndefs*/ false))
51369 return SDValue();
51370
51371 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
51372 // TODO - add UNDEF elts support.
51373 if (UndefElts0[i] || UndefElts1[i])
51374 return SDValue();
51375 if (EltBits0[i] != ~EltBits1[i])
51376 return SDValue();
51377 }
51378
51379 if (useVPTERNLOG(Subtarget, VT)) {
51380 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
51381 // VPTERNLOG is only available as vXi32/64-bit types.
51382 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
51383 MVT OpVT =
51384 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
51385 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
51386 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
51387 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
51388 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
51389 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
51390 DAG, Subtarget);
51391 return DAG.getBitcast(VT, Res);
51392 }
51393
51394 SDValue X = N->getOperand(0);
51395 SDValue Y =
51396 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
51397 DAG.getBitcast(VT, N1.getOperand(0)));
51398 return DAG.getNode(ISD::OR, DL, VT, X, Y);
51399}
51400
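As an illustrative aside (not part of this file): the 0xCA immediate used above encodes the bitwise select A ? B : C, since bit (a*4 + b*2 + c) of a VPTERNLOG immediate holds the result for source bits a, b, c. A minimal standalone sketch that recomputes it:

#include <cassert>
#include <cstdint>

// Rebuild the VPTERNLOG immediate for the bitwise select A ? B : C by
// tabulating the result for every combination of the three source bits.
int main() {
  uint8_t Imm = 0;
  for (unsigned a = 0; a < 2; ++a)
    for (unsigned b = 0; b < 2; ++b)
      for (unsigned c = 0; c < 2; ++c) {
        unsigned Result = a ? b : c;
        Imm |= Result << (a * 4 + b * 2 + c);
      }
  assert(Imm == 0xCA);
  return 0;
}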
51401// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
51402static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
51403 if (N->getOpcode() != ISD::OR)
51404 return false;
51405
51406 SDValue N0 = N->getOperand(0);
51407 SDValue N1 = N->getOperand(1);
51408
51409 // Canonicalize AND to LHS.
51410 if (N1.getOpcode() == ISD::AND)
51411 std::swap(N0, N1);
51412
51413 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
51414 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
51415 return false;
51416
51417 Mask = N1.getOperand(0);
51418 X = N1.getOperand(1);
51419
51420 // Check to see if the mask appeared in both the AND and ANDNP.
51421 if (N0.getOperand(0) == Mask)
51422 Y = N0.getOperand(1);
51423 else if (N0.getOperand(1) == Mask)
51424 Y = N0.getOperand(0);
51425 else
51426 return false;
51427
51428 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
51429 // ANDNP combine lets other combines happen that prevent matching.
51430 return true;
51431}
51432
51433// Try to fold:
51434// (or (and (m, y), (pandn m, x)))
51435// into:
51436// (vselect m, x, y)
51437// As a special case, try to fold:
51438// (or (and (m, (sub 0, x)), (pandn m, x)))
51439// into:
51440// (sub (xor X, M), M)
51441 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
51442 SelectionDAG &DAG,
51443 const X86Subtarget &Subtarget) {
51444 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
51445
51446 EVT VT = N->getValueType(0);
51447 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
51448 (VT.is256BitVector() && Subtarget.hasInt256())))
51449 return SDValue();
51450
51451 SDValue X, Y, Mask;
51452 if (!matchLogicBlend(N, X, Y, Mask))
51453 return SDValue();
51454
51455 // Validate that X, Y, and Mask are bitcasts, and see through them.
51456 Mask = peekThroughBitcasts(Mask);
51457 X = peekThroughBitcasts(X);
51458 Y = peekThroughBitcasts(Y);
51459
51460 EVT MaskVT = Mask.getValueType();
51461 unsigned EltBits = MaskVT.getScalarSizeInBits();
51462
51463 // TODO: Attempt to handle floating point cases as well?
51464 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
51465 return SDValue();
51466
51467 // Attempt to combine to conditional negate: (sub (xor X, M), M)
51468 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
51469 DAG, Subtarget))
51470 return Res;
51471
51472 // PBLENDVB is only available on SSE 4.1.
51473 if (!Subtarget.hasSSE41())
51474 return SDValue();
51475
51476 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
51477 if (Subtarget.hasVLX())
51478 return SDValue();
51479
51480 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
51481
51482 X = DAG.getBitcast(BlendVT, X);
51483 Y = DAG.getBitcast(BlendVT, Y);
51484 Mask = DAG.getBitcast(BlendVT, Mask);
51485 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
51486 return DAG.getBitcast(VT, Mask);
51487}
51488
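As an illustrative aside (not part of this file): the conditional-negate special case handed to combineLogicBlendIntoConditionalNegate above rests on the per-element identity (m & -x) | (~m & x) == (x ^ m) - m when m is all-ones or all-zeros. A minimal standalone check over a few sample values:

#include <cassert>
#include <cstdint>

// With a 0 or ~0 mask m, the blend selects x or -x, which matches the
// xor-then-subtract form (x ^ m) - m.
int main() {
  for (int32_t x : {0, 1, -1, 42, -1234567, INT32_MIN + 1}) {
    for (uint32_t m : {0u, ~0u}) {
      uint32_t ux = (uint32_t)x;
      uint32_t Blend = (m & (0u - ux)) | (~m & ux);
      uint32_t CondNeg = (ux ^ m) - m;
      assert(Blend == CondNeg);
    }
  }
  return 0;
}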
51489// Helper function for combineOrCmpEqZeroToCtlzSrl
51490// Transforms:
51491// seteq(cmp x, 0)
51492// into:
51493// srl(ctlz x), log2(bitsize(x))
51494// Input pattern is checked by caller.
51495 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
51496 SDValue Cmp = Op.getOperand(1);
51497 EVT VT = Cmp.getOperand(0).getValueType();
51498 unsigned Log2b = Log2_32(VT.getSizeInBits());
51499 SDLoc dl(Op);
51500 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
51501 // The result of the shift is true or false, and on X86, the 32-bit
51502 // encoding of shr and lzcnt is more desirable.
51503 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
51504 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
51505 DAG.getConstant(Log2b, dl, MVT::i8));
51506 return Scc;
51507}
51508
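As an illustrative aside (not part of this file): the lowering above works because, for a 32-bit value, only x == 0 makes the leading-zero count reach 32, so ctlz(x) >> 5 is exactly the seteq(x, 0) result. A minimal standalone check, assuming C++20 for std::countl_zero:

#include <bit>
#include <cassert>
#include <cstdint>

// (x == 0) expressed as srl(ctlz(x), log2(32)): the shift result is 1
// only when the leading-zero count saturates at the full bit width.
int main() {
  for (uint32_t x : {0u, 1u, 2u, 255u, 0x80000000u, 0xFFFFFFFFu}) {
    unsigned ViaCtlz = (unsigned)std::countl_zero(x) >> 5;
    unsigned ViaCmp = (x == 0) ? 1u : 0u;
    assert(ViaCtlz == ViaCmp);
  }
  return 0;
}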
51509// Try to transform:
51510// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
51511// into:
51512// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
51513// Will also attempt to match more generic cases, eg:
51514// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
51515// Only applies if the target supports the FastLZCNT feature.
51516 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
51517 TargetLowering::DAGCombinerInfo &DCI,
51518 const X86Subtarget &Subtarget) {
51519 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
51520 return SDValue();
51521
51522 auto isORCandidate = [](SDValue N) {
51523 return (N->getOpcode() == ISD::OR && N->hasOneUse());
51524 };
51525
51526 // Check the zero extend is extending to 32-bit or more. The code generated by
51527 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
51528 // instructions to clear the upper bits.
51529 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
51530 !isORCandidate(N->getOperand(0)))
51531 return SDValue();
51532
51533 // Check the node matches: setcc(eq, cmp 0)
51534 auto isSetCCCandidate = [](SDValue N) {
51535 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
51536 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
51537 N->getOperand(1).getOpcode() == X86ISD::CMP &&
51538 isNullConstant(N->getOperand(1).getOperand(1)) &&
51539 N->getOperand(1).getValueType().bitsGE(MVT::i32);
51540 };
51541
51542 SDNode *OR = N->getOperand(0).getNode();
51543 SDValue LHS = OR->getOperand(0);
51544 SDValue RHS = OR->getOperand(1);
51545
51546 // Save nodes matching or(or, setcc(eq, cmp 0)).
51547 SmallVector<SDNode *, 2> ORNodes;
51548 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
51549 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
51550 ORNodes.push_back(OR);
51551 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
51552 LHS = OR->getOperand(0);
51553 RHS = OR->getOperand(1);
51554 }
51555
51556 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
51557 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
51558 !isORCandidate(SDValue(OR, 0)))
51559 return SDValue();
51560
51561 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
51562 // to
51563 // or(srl(ctlz),srl(ctlz)).
51564 // The dag combiner can then fold it into:
51565 // srl(or(ctlz, ctlz)).
51566 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
51567 SDValue Ret, NewRHS;
51568 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
51569 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
51570
51571 if (!Ret)
51572 return SDValue();
51573
51574 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
51575 while (!ORNodes.empty()) {
51576 OR = ORNodes.pop_back_val();
51577 LHS = OR->getOperand(0);
51578 RHS = OR->getOperand(1);
51579 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
51580 if (RHS->getOpcode() == ISD::OR)
51581 std::swap(LHS, RHS);
51582 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
51583 if (!NewRHS)
51584 return SDValue();
51585 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
51586 }
51587
51588 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
51589}
51590
51591 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
51592 SDValue And1_L, SDValue And1_R,
51593 const SDLoc &DL, SelectionDAG &DAG) {
51594 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
51595 return SDValue();
51596 SDValue NotOp = And0_L->getOperand(0);
51597 if (NotOp == And1_R)
51598 std::swap(And1_R, And1_L);
51599 if (NotOp != And1_L)
51600 return SDValue();
51601
51602 // (~(NotOp) & And0_R) | (NotOp & And1_R)
51603 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
51604 EVT VT = And1_L->getValueType(0);
51605 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
51606 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
51607 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
51608 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
51609 return Xor1;
51610}
51611
51612/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
51613 /// equivalent `((x ^ y) & m) ^ y` pattern.
51614/// This is typically a better representation for targets without a fused
51615/// "and-not" operation. This function is intended to be called from a
51616/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
51617 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
51618 // Note that masked-merge variants using XOR or ADD expressions are
51619 // normalized to OR by InstCombine so we only check for OR.
51620 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
51621 SDValue N0 = Node->getOperand(0);
51622 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
51623 return SDValue();
51624 SDValue N1 = Node->getOperand(1);
51625 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
51626 return SDValue();
51627
51628 SDLoc DL(Node);
51629 SDValue N00 = N0->getOperand(0);
51630 SDValue N01 = N0->getOperand(1);
51631 SDValue N10 = N1->getOperand(0);
51632 SDValue N11 = N1->getOperand(1);
51633 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
51634 return Result;
51635 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
51636 return Result;
51637 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
51638 return Result;
51639 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
51640 return Result;
51641 return SDValue();
51642}
51643
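As an illustrative aside (not part of this file): foldMaskedMergeImpl relies on the identity (m & x) | (~m & y) == ((x ^ y) & m) ^ y, which avoids the and-not at the cost of an extra xor. A minimal standalone check:

#include <cassert>
#include <cstdint>

// Masked merge: ((x ^ y) & m) ^ y picks bits of x where m is set and bits
// of y where m is clear, matching (m & x) | (~m & y).
int main() {
  const uint32_t Vals[] = {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu, 0x0F0F0F0Fu};
  for (uint32_t m : Vals)
    for (uint32_t x : Vals)
      for (uint32_t y : Vals)
        assert(((m & x) | (~m & y)) == (((x ^ y) & m) ^ y));
  return 0;
}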
51644/// If this is an add or subtract where one operand is produced by a cmp+setcc,
51645/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
51646/// with CMP+{ADC, SBB}.
51647/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
51648static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
51649 SDValue X, SDValue Y,
51650 SelectionDAG &DAG,
51651 bool ZeroSecondOpOnly = false) {
51652 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
51653 return SDValue();
51654
51655 // Look through a one-use zext.
51656 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
51657 Y = Y.getOperand(0);
51658
51659 X86::CondCode CC = X86::COND_INVALID;
51660 SDValue EFLAGS;
51661 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
51662 CC = (X86::CondCode)Y.getConstantOperandVal(0);
51663 EFLAGS = Y.getOperand(1);
51664 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
51665 Y.hasOneUse()) {
51666 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
51667 }
51668
51669 if (!EFLAGS)
51670 return SDValue();
51671
51672 // If X is -1 or 0, then we have an opportunity to avoid constants required in
51673 // the general case below.
51674 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
51675 if (ConstantX && !ZeroSecondOpOnly) {
51676 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
51677 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
51678 // This is a complicated way to get -1 or 0 from the carry flag:
51679 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51680 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51681 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51682 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51683 EFLAGS);
51684 }
51685
51686 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
51687 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
51688 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
51689 EFLAGS.getValueType().isInteger() &&
51690 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51691 // Swap the operands of a SUB, and we have the same pattern as above.
51692 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
51693 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
51694 SDValue NewSub = DAG.getNode(
51695 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51696 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51697 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
51698 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51699 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51700 NewEFLAGS);
51701 }
51702 }
51703 }
51704
51705 if (CC == X86::COND_B) {
51706 // X + SETB Z --> adc X, 0
51707 // X - SETB Z --> sbb X, 0
51708 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
51709 DAG.getVTList(VT, MVT::i32), X,
51710 DAG.getConstant(0, DL, VT), EFLAGS);
51711 }
51712
51713 if (ZeroSecondOpOnly)
51714 return SDValue();
51715
51716 if (CC == X86::COND_A) {
51717 // Try to convert COND_A into COND_B in an attempt to facilitate
51718 // materializing "setb reg".
51719 //
51720 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
51721 // cannot take an immediate as its first operand.
51722 //
51723 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51724 EFLAGS.getValueType().isInteger() &&
51725 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51726 SDValue NewSub =
51727 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51728 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51729 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
51730 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
51731 DAG.getVTList(VT, MVT::i32), X,
51732 DAG.getConstant(0, DL, VT), NewEFLAGS);
51733 }
51734 }
51735
51736 if (CC == X86::COND_AE) {
51737 // X + SETAE --> sbb X, -1
51738 // X - SETAE --> adc X, -1
51739 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
51740 DAG.getVTList(VT, MVT::i32), X,
51741 DAG.getAllOnesConstant(DL, VT), EFLAGS);
51742 }
51743
51744 if (CC == X86::COND_BE) {
51745 // X + SETBE --> sbb X, -1
51746 // X - SETBE --> adc X, -1
51747 // Try to convert COND_BE into COND_AE in an attempt to facilitate
51748 // materializing "setae reg".
51749 //
51750 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
51751 // cannot take an immediate as its first operand.
51752 //
51753 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51754 EFLAGS.getValueType().isInteger() &&
51755 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51756 SDValue NewSub =
51757 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51758 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51759 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
51760 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
51761 DAG.getVTList(VT, MVT::i32), X,
51762 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
51763 }
51764 }
51765
51766 if (CC != X86::COND_E && CC != X86::COND_NE)
51767 return SDValue();
51768
51769 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
51770 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
51771 !EFLAGS.getOperand(0).getValueType().isInteger())
51772 return SDValue();
51773
51774 SDValue Z = EFLAGS.getOperand(0);
51775 EVT ZVT = Z.getValueType();
51776
51777 // If X is -1 or 0, then we have an opportunity to avoid constants required in
51778 // the general case below.
51779 if (ConstantX) {
51780 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
51781 // fake operands:
51782 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
51783 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
51784 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
51785 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
51786 SDValue Zero = DAG.getConstant(0, DL, ZVT);
51787 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
51788 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
51789 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51790 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51791 SDValue(Neg.getNode(), 1));
51792 }
51793
51794 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
51795 // with fake operands:
51796 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
51797 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
51798 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
51799 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
51800 SDValue One = DAG.getConstant(1, DL, ZVT);
51801 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
51802 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
51803 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51804 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51805 Cmp1.getValue(1));
51806 }
51807 }
51808
51809 // (cmp Z, 1) sets the carry flag if Z is 0.
51810 SDValue One = DAG.getConstant(1, DL, ZVT);
51811 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
51812 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
51813
51814 // Add the flags type for ADC/SBB nodes.
51815 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
51816
51817 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
51818 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
51819 if (CC == X86::COND_NE)
51820 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
51821 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
51822
51823 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
51824 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
51825 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
51826 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
51827}
51828
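As an illustrative aside (not part of this file): the constant-X special cases above materialize 0 or -1 directly from the carry flag; at the value level the expressions in the comments reduce to the following, which a minimal standalone sketch can confirm:

#include <cassert>
#include <cstdint>

// 0 - (Z != 0) and -1 + (Z == 0) are the 0 / all-ones masks that the
// SBB-with-fake-operands idioms above produce.
int main() {
  for (uint32_t z : {0u, 1u, 7u, 0xFFFFFFFFu}) {
    uint32_t ZeroMinusNe = 0u - (uint32_t)(z != 0);            // 0 - (Z != 0)
    uint32_t MinusOnePlusEq = 0xFFFFFFFFu + (uint32_t)(z == 0); // -1 + (Z == 0)
    assert(ZeroMinusNe == (z != 0 ? 0xFFFFFFFFu : 0u));
    assert(MinusOnePlusEq == (z == 0 ? 0u : 0xFFFFFFFFu));
  }
  return 0;
}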
51829/// If this is an add or subtract where one operand is produced by a cmp+setcc,
51830/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
51831/// with CMP+{ADC, SBB}.
51832 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
51833 SelectionDAG &DAG) {
51834 bool IsSub = N->getOpcode() == ISD::SUB;
51835 SDValue X = N->getOperand(0);
51836 SDValue Y = N->getOperand(1);
51837 EVT VT = N->getValueType(0);
51838
51839 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
51840 return ADCOrSBB;
51841
51842 // Commute and try again (negate the result for subtracts).
51843 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
51844 if (IsSub)
51845 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
51846 return ADCOrSBB;
51847 }
51848
51849 return SDValue();
51850}
51851
51852static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
51853 SDValue N0, SDValue N1,
51854 SelectionDAG &DAG) {
51855 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
51856
51857 // Delegate to combineAddOrSubToADCOrSBB if we have:
51858 //
51859 // (xor/or (zero_extend (setcc)) imm)
51860 //
51861 // where imm is odd if and only if we have xor, in which case the XOR/OR are
51862 // equivalent to a SUB/ADD, respectively.
51863 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
51864 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
51865 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
51866 bool IsSub = Opc == ISD::XOR;
51867 bool N1COdd = N1C->getZExtValue() & 1;
51868 if (IsSub ? N1COdd : !N1COdd)
51869 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
51870 return R;
51871 }
51872 }
51873
51874 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
51875 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
51876 N0.getOperand(0).getOpcode() == ISD::AND &&
51877 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
51878 ISD::isBuildVectorAllOnes(N1.getNode()) &&
51879 isConstantPowerOf2(N0.getOperand(0).getOperand(1),
51880 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
51881 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
51882 N0.getOperand(0).getOperand(1));
51883 }
51884
51885 return SDValue();
51886}
51887
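As an illustrative aside (not part of this file): the delegation above uses that, for a zero-extended setcc bit b, xor with an odd immediate is a subtraction and or with an even immediate is an addition. A minimal standalone check over a few sample immediates:

#include <cassert>
#include <cstdint>

// For b in {0,1}: b ^ OddImm == OddImm - b and b | EvenImm == EvenImm + b,
// which is why the XOR/OR can be handed to the ADC/SBB combine.
int main() {
  for (uint32_t b : {0u, 1u}) {
    for (uint32_t imm : {1u, 3u, 0x12345u, 0xFFFFFFFFu}) // odd immediates
      assert((b ^ imm) == imm - b);
    for (uint32_t imm : {0u, 2u, 0x12344u, 0xFFFFFFFEu}) // even immediates
      assert((b | imm) == imm + b);
  }
  return 0;
}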
51888 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
51889 TargetLowering::DAGCombinerInfo &DCI,
51890 const X86Subtarget &Subtarget) {
51891 SDValue N0 = N->getOperand(0);
51892 SDValue N1 = N->getOperand(1);
51893 EVT VT = N->getValueType(0);
51894 SDLoc dl(N);
51895 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51896
51897 // If this is SSE1 only convert to FOR to avoid scalarization.
51898 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51899 return DAG.getBitcast(MVT::v4i32,
51900 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
51901 DAG.getBitcast(MVT::v4f32, N0),
51902 DAG.getBitcast(MVT::v4f32, N1)));
51903 }
51904
51905 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
51906 // TODO: Support multiple SrcOps.
51907 if (VT == MVT::i1) {
51908 SmallVector<SDValue, 2> SrcOps;
51909 SmallVector<APInt, 2> SrcPartials;
51910 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
51911 SrcOps.size() == 1) {
51912 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51913 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51914 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51915 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51916 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51917 if (Mask) {
51918 assert(SrcPartials[0].getBitWidth() == NumElts &&
51919 "Unexpected partial reduction mask");
51920 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
51921 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51922 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51923 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
51924 }
51925 }
51926 }
51927
51928 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51929 return SetCC;
51930
51931 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51932 return R;
51933
51934 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51935 return R;
51936
51937 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51938 return R;
51939
51940 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51941 DAG, DCI, Subtarget))
51942 return FPLogic;
51943
51944 if (DCI.isBeforeLegalizeOps())
51945 return SDValue();
51946
51947 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51948 return R;
51949
51950 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
51951 return R;
51952
51953 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
51954 return R;
51955
51956 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
51957 if ((VT == MVT::i32 || VT == MVT::i64) &&
51958 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
51959 isNullConstant(N0.getOperand(0))) {
51960 SDValue Cond = N0.getOperand(1);
51961 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
51962 Cond = Cond.getOperand(0);
51963
51964 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
51965 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
51966 uint64_t Val = CN->getZExtValue();
51967 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
51968 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
51969 CCode = X86::GetOppositeBranchCondition(CCode);
51970 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
51971
51972 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
51973 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
51974 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
51975 return R;
51976 }
51977 }
51978 }
51979 }
51980
51981 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
51982 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
51983 // iff the upper elements of the non-shifted arg are zero.
51984 // KUNPCK requires 16+ bool vector elements.
51985 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
51986 unsigned NumElts = VT.getVectorNumElements();
51987 unsigned HalfElts = NumElts / 2;
51988 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
51989 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
51990 N1.getConstantOperandAPInt(1) == HalfElts &&
51991 DAG.MaskedVectorIsZero(N0, UpperElts)) {
51992 return DAG.getNode(
51993 ISD::CONCAT_VECTORS, dl, VT,
51994 extractSubVector(N0, 0, DAG, dl, HalfElts),
51995 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
51996 }
51997 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
51998 N0.getConstantOperandAPInt(1) == HalfElts &&
51999 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52000 return DAG.getNode(
52001 ISD::CONCAT_VECTORS, dl, VT,
52002 extractSubVector(N1, 0, DAG, dl, HalfElts),
52003 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52004 }
52005 }
52006
52007 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52008 // Attempt to recursively combine an OR of shuffles.
52009 SDValue Op(N, 0);
52010 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52011 return Res;
52012
52013 // If either operand is a constant mask, then only the elements that aren't
52014 // allones are actually demanded by the other operand.
52015 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52016 APInt UndefElts;
52017 SmallVector<APInt> EltBits;
52018 int NumElts = VT.getVectorNumElements();
52019 int EltSizeInBits = VT.getScalarSizeInBits();
52020 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52021 return false;
52022
52023 APInt DemandedElts = APInt::getZero(NumElts);
52024 for (int I = 0; I != NumElts; ++I)
52025 if (!EltBits[I].isAllOnes())
52026 DemandedElts.setBit(I);
52027
52028 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52029 };
52030 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52031 if (N->getOpcode() != ISD::DELETED_NODE)
52032 DCI.AddToWorklist(N);
52033 return SDValue(N, 0);
52034 }
52035 }
52036
52037 // We should fold "masked merge" patterns when `andn` is not available.
52038 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
52039 if (SDValue R = foldMaskedMerge(N, DAG))
52040 return R;
52041
52042 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52043 return R;
52044
52045 return SDValue();
52046}
52047
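As an illustrative aside (not part of this file): the LEA-oriented rewrite in combineOr depends on (0 - b) | C == (1 - b) * (C + 1) - 1 for a boolean b. A minimal standalone check over the constants the combine accepts:

#include <cassert>
#include <cstdint>

// (0 - SetCC) | C == zext(!SetCC) * (C + 1) - 1 for SetCC in {0,1} and the
// LEA-friendly constants tested above.
int main() {
  for (uint32_t b : {0u, 1u}) {
    for (uint32_t C : {1u, 2u, 3u, 4u, 7u, 8u}) {
      uint32_t Lhs = (0u - b) | C;
      uint32_t Rhs = (1u - b) * (C + 1u) - 1u;
      assert(Lhs == Rhs);
    }
  }
  return 0;
}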
52048/// Try to turn tests against the signbit in the form of:
52049/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52050/// into:
52051/// SETGT(X, -1)
52052 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
52053 // This is only worth doing if the output type is i8 or i1.
52054 EVT ResultType = N->getValueType(0);
52055 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52056 return SDValue();
52057
52058 SDValue N0 = N->getOperand(0);
52059 SDValue N1 = N->getOperand(1);
52060
52061 // We should be performing an xor against a truncated shift.
52062 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52063 return SDValue();
52064
52065 // Make sure we are performing an xor against one.
52066 if (!isOneConstant(N1))
52067 return SDValue();
52068
52069 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52070 SDValue Shift = N0.getOperand(0);
52071 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52072 return SDValue();
52073
52074 // Make sure we are truncating from one of i16, i32 or i64.
52075 EVT ShiftTy = Shift.getValueType();
52076 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52077 return SDValue();
52078
52079 // Make sure the shift amount extracts the sign bit.
52080 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52081 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52082 return SDValue();
52083
52084 // Create a greater-than comparison against -1.
52085 // N.B. Using SETGE against 0 works but we want a canonical looking
52086 // comparison, using SETGT matches up with what TranslateX86CC does.
52087 SDLoc DL(N);
52088 SDValue ShiftOp = Shift.getOperand(0);
52089 EVT ShiftOpTy = ShiftOp.getValueType();
52090 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52091 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52092 *DAG.getContext(), ResultType);
52093 SDValue Cond =
52094 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52095 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52096 if (SetCCResultType != ResultType)
52097 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52098 return Cond;
52099}
52100
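As an illustrative aside (not part of this file): for 32-bit X the rewrite above is the scalar identity xor(trunc(srl(X, 31)), 1) == setgt(X, -1). A minimal standalone check:

#include <cassert>
#include <cstdint>

// Sign-bit test: shifting the sign bit down and flipping it asks the same
// question as the signed comparison X > -1.
int main() {
  for (int32_t x : {0, 1, -1, INT32_MAX, INT32_MIN, 123456, -123456}) {
    uint32_t ViaShift = ((uint32_t)x >> 31) ^ 1u;
    uint32_t ViaCmp = (x > -1) ? 1u : 0u;
    assert(ViaShift == ViaCmp);
  }
  return 0;
}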
52101/// Turn vector tests of the signbit in the form of:
52102/// xor (sra X, elt_size(X)-1), -1
52103/// into:
52104/// pcmpgt X, -1
52105///
52106/// This should be called before type legalization because the pattern may not
52107/// persist after that.
52108 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52109 const X86Subtarget &Subtarget) {
52110 EVT VT = N->getValueType(0);
52111 if (!VT.isSimple())
52112 return SDValue();
52113
52114 switch (VT.getSimpleVT().SimpleTy) {
52115 // clang-format off
52116 default: return SDValue();
52117 case MVT::v16i8:
52118 case MVT::v8i16:
52119 case MVT::v4i32:
52120 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52121 case MVT::v32i8:
52122 case MVT::v16i16:
52123 case MVT::v8i32:
52124 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52125 // clang-format on
52126 }
52127
52128 // There must be a shift right algebraic before the xor, and the xor must be a
52129 // 'not' operation.
52130 SDValue Shift = N->getOperand(0);
52131 SDValue Ones = N->getOperand(1);
52132 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52133 !ISD::isBuildVectorAllOnes(Ones.getNode()))
52134 return SDValue();
52135
52136 // The shift should be smearing the sign bit across each vector element.
52137 auto *ShiftAmt =
52138 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52139 if (!ShiftAmt ||
52140 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52141 return SDValue();
52142
52143 // Create a greater-than comparison against -1. We don't use the more obvious
52144 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52145 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52146}
52147
52148/// Detect patterns of truncation with unsigned saturation:
52149///
52150/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52151/// Return the source value x to be truncated or SDValue() if the pattern was
52152/// not matched.
52153///
52154/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52155/// where C1 >= 0 and C2 is unsigned max of destination type.
52156///
52157/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52158/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52159///
52160/// These two patterns are equivalent to:
52161/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52162/// So return the smax(x, C1) value to be truncated or SDValue() if the
52163/// pattern was not matched.
52164 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52165 const SDLoc &DL) {
52166 using namespace llvm::SDPatternMatch;
52167 EVT InVT = In.getValueType();
52168
52169 // Saturation with truncation. We truncate from InVT to VT.
52171 "Unexpected types for truncate operation");
52172
52173 APInt C1, C2;
52174 SDValue UMin, SMin, SMax;
52175
52176 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52177 // the element size of the destination type.
52178 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52179 C2.isMask(VT.getScalarSizeInBits()))
52180 return UMin;
52181
52182 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52183 sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52184 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52185 return SMin;
52186
52187 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52188 sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52189 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52190 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52191
52192 return SDValue();
52193}
52194
52195/// Detect patterns of truncation with signed saturation:
52196/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52197/// signed_max_of_dest_type)) to dest_type)
52198/// or:
52199/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52200/// signed_min_of_dest_type)) to dest_type).
52201/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52202/// Return the source value to be truncated or SDValue() if the pattern was not
52203/// matched.
52204static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52205 using namespace llvm::SDPatternMatch;
52206 unsigned NumDstBits = VT.getScalarSizeInBits();
52207 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52208 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52209
52210 APInt SignedMax, SignedMin;
52211 if (MatchPackUS) {
52212 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52213 SignedMin = APInt::getZero(NumSrcBits);
52214 } else {
52215 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52216 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52217 }
52218
52219 SDValue SMin, SMax;
52220 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52221 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52222 return SMax;
52223
52224 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52225 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52226 return SMin;
52227
52228 return SDValue();
52229}
52230
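As an illustrative aside (not part of this file): the min/max chains matched by detectUSatPattern and detectSSatPattern are the source-level spellings of saturating truncation. A minimal standalone sketch of the 32-to-8-bit unsigned and signed clamps they describe, assuming C++17 for std::clamp:

#include <algorithm>
#include <cassert>
#include <cstdint>

// trunc(umin(smax(x, 0), 255)) is an unsigned saturating truncate (PACKUS);
// the smin/smax chain clamped to [-128, 127] is the signed form (PACKSS).
int main() {
  for (int32_t x : {-1000, -129, -128, -1, 0, 1, 127, 128, 255, 256, 100000}) {
    uint8_t USat =
        (uint8_t)std::min<uint32_t>((uint32_t)std::max<int32_t>(x, 0), 255u);
    int8_t SSat = (int8_t)std::clamp<int32_t>(x, -128, 127);
    assert(USat == (x < 0 ? 0 : x > 255 ? 255 : x));
    assert(SSat == (x < -128 ? -128 : x > 127 ? 127 : x));
  }
  return 0;
}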
52231 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52232 SelectionDAG &DAG,
52233 const X86Subtarget &Subtarget) {
52234 if (!Subtarget.hasSSE2() || !VT.isVector())
52235 return SDValue();
52236
52237 EVT SVT = VT.getVectorElementType();
52238 EVT InVT = In.getValueType();
52239 EVT InSVT = InVT.getVectorElementType();
52240
52241 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52242 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52243 // and concatenate at the same time. Then we can use a final vpmovuswb to
52244 // clip to 0-255.
52245 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52246 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52247 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52248 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52249 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52250 DL, DAG, Subtarget);
52251 assert(Mid && "Failed to pack!");
52252 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52253 }
52254 }
52255
52256 // vXi32 truncate instructions are available with AVX512F.
52257 // vXi16 truncate instructions are only available with AVX512BW.
52258 // For 256-bit or smaller vectors, we require VLX.
52259 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52260 // If the result type is 256-bits or larger and we have disabled 512-bit
52261 // registers, we should go ahead and use the pack instructions if possible.
52262 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52263 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52264 (InVT.getSizeInBits() > 128) &&
52265 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52266 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52267
52268 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52269 isPowerOf2_32(VT.getVectorNumElements()) &&
52270 (SVT == MVT::i8 || SVT == MVT::i16) &&
52271 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52272 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52273 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52274 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52275 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52276 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52277 DAG, Subtarget);
52278 assert(Mid && "Failed to pack!");
52279 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52280 Subtarget);
52281 assert(V && "Failed to pack!");
52282 return V;
52283 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52284 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52285 Subtarget);
52286 }
52287 if (SDValue SSatVal = detectSSatPattern(In, VT))
52288 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52289 Subtarget);
52290 }
52291
52292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52293 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52294 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52295 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52296 unsigned TruncOpc = 0;
52297 SDValue SatVal;
52298 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52299 SatVal = SSatVal;
52300 TruncOpc = X86ISD::VTRUNCS;
52301 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52302 SatVal = USatVal;
52303 TruncOpc = X86ISD::VTRUNCUS;
52304 }
52305 if (SatVal) {
52306 unsigned ResElts = VT.getVectorNumElements();
52307 // If the input type is less than 512 bits and we don't have VLX, we need
52308 // to widen to 512 bits.
52309 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52310 unsigned NumConcats = 512 / InVT.getSizeInBits();
52311 ResElts *= NumConcats;
52312 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52313 ConcatOps[0] = SatVal;
52314 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52315 NumConcats * InVT.getVectorNumElements());
52316 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52317 }
52318 // Widen the result if its narrower than 128 bits.
52319 if (ResElts * SVT.getSizeInBits() < 128)
52320 ResElts = 128 / SVT.getSizeInBits();
52321 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52322 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52323 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52324 DAG.getVectorIdxConstant(0, DL));
52325 }
52326 }
52327
52328 return SDValue();
52329}
52330
52331 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
52332 SelectionDAG &DAG,
52333 TargetLowering::DAGCombinerInfo &DCI,
52334 const X86Subtarget &Subtarget) {
52335 auto *Ld = cast<LoadSDNode>(N);
52336 EVT RegVT = Ld->getValueType(0);
52337 SDValue Ptr = Ld->getBasePtr();
52338 SDValue Chain = Ld->getChain();
52339 ISD::LoadExtType Ext = Ld->getExtensionType();
52340
52341 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52342 return SDValue();
52343
52344 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
52345 return SDValue();
52346
52347 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
52348 if (!LdC)
52349 return SDValue();
52350
52351 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
52352 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
52353 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
52354 if (Undefs[I])
52355 continue;
52356 if (UserUndefs[I] || Bits[I] != UserBits[I])
52357 return false;
52358 }
52359 return true;
52360 };
52361
52362 // Look through all other loads/broadcasts in the chain for another constant
52363 // pool entry.
52364 for (SDNode *User : Chain->users()) {
52365 auto *UserLd = dyn_cast<MemSDNode>(User);
52366 if (User != N && UserLd &&
52367 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
52368 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
52370 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
52371 User->getValueSizeInBits(0).getFixedValue() >
52372 RegVT.getFixedSizeInBits()) {
52373 EVT UserVT = User->getValueType(0);
52374 SDValue UserPtr = UserLd->getBasePtr();
52375 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
52376
52377 // See if we are loading a constant that matches in the lower
52378 // bits of a longer constant (but from a different constant pool ptr).
52379 if (UserC && UserPtr != Ptr) {
52380 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
52381 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
52382 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
52383 APInt Undefs, UserUndefs;
52384 SmallVector<APInt> Bits, UserBits;
52385 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
52386 UserVT.getScalarSizeInBits());
52387 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
52388 Bits) &&
52389 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
52390 UserUndefs, UserBits)) {
52391 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
52392 SDValue Extract = extractSubVector(
52393 SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
52394 Extract = DAG.getBitcast(RegVT, Extract);
52395 return DCI.CombineTo(N, Extract, SDValue(User, 1));
52396 }
52397 }
52398 }
52399 }
52400 }
52401 }
52402
52403 return SDValue();
52404}
52405
52406 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
52407 TargetLowering::DAGCombinerInfo &DCI,
52408 const X86Subtarget &Subtarget) {
52409 auto *Ld = cast<LoadSDNode>(N);
52410 EVT RegVT = Ld->getValueType(0);
52411 EVT MemVT = Ld->getMemoryVT();
52412 SDLoc dl(Ld);
52413 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52414
52415 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
52416 // into two 16-byte operations. Also split non-temporal aligned loads on
52417 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
52418 ISD::LoadExtType Ext = Ld->getExtensionType();
52419 unsigned Fast;
52420 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
52421 Ext == ISD::NON_EXTLOAD &&
52422 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
52423 Ld->getAlign() >= Align(16)) ||
52424 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
52425 *Ld->getMemOperand(), &Fast) &&
52426 !Fast))) {
52427 unsigned NumElems = RegVT.getVectorNumElements();
52428 if (NumElems < 2)
52429 return SDValue();
52430
52431 unsigned HalfOffset = 16;
52432 SDValue Ptr1 = Ld->getBasePtr();
52433 SDValue Ptr2 =
52434 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
52435 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
52436 NumElems / 2);
52437 SDValue Load1 =
52438 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
52439 Ld->getOriginalAlign(),
52440 Ld->getMemOperand()->getFlags());
52441 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
52442 Ld->getPointerInfo().getWithOffset(HalfOffset),
52443 Ld->getOriginalAlign(),
52444 Ld->getMemOperand()->getFlags());
52445 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
52446 Load1.getValue(1), Load2.getValue(1));
52447
52448 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
52449 return DCI.CombineTo(N, NewVec, TF, true);
52450 }
52451
52452 // Bool vector load - attempt to cast to an integer, as we have good
52453 // (vXiY *ext(vXi1 bitcast(iX))) handling.
52454 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
52455 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
52456 unsigned NumElts = RegVT.getVectorNumElements();
52457 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52458 if (TLI.isTypeLegal(IntVT)) {
52459 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
52460 Ld->getPointerInfo(),
52461 Ld->getOriginalAlign(),
52462 Ld->getMemOperand()->getFlags());
52463 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
52464 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
52465 }
52466 }
52467
52468 // If we also broadcast this vector to a wider type, then just extract the
52469 // lowest subvector.
52470 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
52471 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
52472 SDValue Ptr = Ld->getBasePtr();
52473 SDValue Chain = Ld->getChain();
52474 for (SDNode *User : Chain->users()) {
52475 auto *UserLd = dyn_cast<MemSDNode>(User);
52476 if (User != N && UserLd &&
52477 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
52478 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
52479 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
52480 !User->hasAnyUseOfValue(1) &&
52481 User->getValueSizeInBits(0).getFixedValue() >
52482 RegVT.getFixedSizeInBits()) {
52483 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
52484 RegVT.getSizeInBits());
52485 Extract = DAG.getBitcast(RegVT, Extract);
52486 return DCI.CombineTo(N, Extract, SDValue(User, 1));
52487 }
52488 }
52489 }
52490
52491 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
52492 return V;
52493
52494 // Cast ptr32 and ptr64 pointers to the default address space before a load.
52495 unsigned AddrSpace = Ld->getAddressSpace();
52496 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
52497 AddrSpace == X86AS::PTR32_UPTR) {
52498 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
52499 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
52500 SDValue Cast =
52501 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
52502 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
52503 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
52504 Ld->getMemOperand()->getFlags());
52505 }
52506 }
52507
52508 return SDValue();
52509}
52510
52511/// If V is a build vector of boolean constants and exactly one of those
52512/// constants is true, return the operand index of that true element.
52513/// Otherwise, return -1.
52514static int getOneTrueElt(SDValue V) {
52515 // This needs to be a build vector of booleans.
52516 // TODO: Checking for the i1 type matches the IR definition for the mask,
52517 // but the mask check could be loosened to i8 or other types. That might
52518 // also require checking more than 'allOnesValue'; eg, the x86 HW
52519 // instructions only require that the MSB is set for each mask element.
52520 // The ISD::MSTORE comments/definition do not specify how the mask operand
52521 // is formatted.
52522 auto *BV = dyn_cast<BuildVectorSDNode>(V);
52523 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
52524 return -1;
52525
52526 int TrueIndex = -1;
52527 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
52528 for (unsigned i = 0; i < NumElts; ++i) {
52529 const SDValue &Op = BV->getOperand(i);
52530 if (Op.isUndef())
52531 continue;
52532 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
52533 if (!ConstNode)
52534 return -1;
52535 if (ConstNode->getAPIntValue().countr_one() >= 1) {
52536 // If we already found a one, this is too many.
52537 if (TrueIndex >= 0)
52538 return -1;
52539 TrueIndex = i;
52540 }
52541 }
52542 return TrueIndex;
52543}
52544
52545/// Given a masked memory load/store operation, return true if it has one mask
52546/// bit set. If it has one mask bit set, then also return the memory address of
52547/// the scalar element to load/store, the vector index to insert/extract that
52548/// scalar element, and the alignment for the scalar memory access.
52549 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
52550 SelectionDAG &DAG, SDValue &Addr,
52551 SDValue &Index, Align &Alignment,
52552 unsigned &Offset) {
52553 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
52554 if (TrueMaskElt < 0)
52555 return false;
52556
52557 // Get the address of the one scalar element that is specified by the mask
52558 // using the appropriate offset from the base pointer.
52559 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
52560 Offset = 0;
52561 Addr = MaskedOp->getBasePtr();
52562 if (TrueMaskElt != 0) {
52563 Offset = TrueMaskElt * EltVT.getStoreSize();
52564 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
52565 SDLoc(MaskedOp));
52566 }
52567
52568 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
52569 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
52570 EltVT.getStoreSize());
52571 return true;
52572}
52573
52574/// If exactly one element of the mask is set for a non-extending masked load,
52575/// it is a scalar load and vector insert.
52576/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52577/// mask have already been optimized in IR, so we don't bother with those here.
52578static SDValue
52579 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
52580 TargetLowering::DAGCombinerInfo &DCI,
52581 const X86Subtarget &Subtarget) {
52582 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52583 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52584 // However, some target hooks may need to be added to know when the transform
52585 // is profitable. Endianness would also have to be considered.
52586
52587 SDValue Addr, VecIndex;
52588 Align Alignment;
52589 unsigned Offset;
52590 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
52591 return SDValue();
52592
52593 // Load the one scalar element that is specified by the mask using the
52594 // appropriate offset from the base pointer.
52595 SDLoc DL(ML);
52596 EVT VT = ML->getValueType(0);
52597 EVT EltVT = VT.getVectorElementType();
52598
52599 EVT CastVT = VT;
52600 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
52601 EltVT = MVT::f64;
52602 CastVT = VT.changeVectorElementType(EltVT);
52603 }
52604
52605 SDValue Load =
52606 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
52607 ML->getPointerInfo().getWithOffset(Offset),
52608 Alignment, ML->getMemOperand()->getFlags());
52609
52610 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
52611
52612 // Insert the loaded element into the appropriate place in the vector.
52613 SDValue Insert =
52614 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
52615 Insert = DAG.getBitcast(VT, Insert);
52616 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
52617}
52618
52619static SDValue
52620 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
52621 TargetLowering::DAGCombinerInfo &DCI) {
52622 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52623 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
52624 return SDValue();
52625
52626 SDLoc DL(ML);
52627 EVT VT = ML->getValueType(0);
52628
52629 // If we are loading the first and last elements of a vector, it is safe and
52630 // always faster to load the whole vector. Replace the masked load with a
52631 // vector load and select.
52632 unsigned NumElts = VT.getVectorNumElements();
52633 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
52634 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
52635 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
52636 if (LoadFirstElt && LoadLastElt) {
52637 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
52638 ML->getMemOperand());
52639 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
52640 ML->getPassThru());
52641 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
52642 }
52643
52644 // Convert a masked load with a constant mask into a masked load and a select.
52645 // This allows the select operation to use a faster kind of select instruction
52646 // (for example, vblendvps -> vblendps).
52647
52648 // Don't try this if the pass-through operand is already undefined. That would
52649 // cause an infinite loop because that's what we're about to create.
52650 if (ML->getPassThru().isUndef())
52651 return SDValue();
52652
52653 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
52654 return SDValue();
52655
52656 // The new masked load has an undef pass-through operand. The select uses the
52657 // original pass-through operand.
52658 SDValue NewML = DAG.getMaskedLoad(
52659 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
52660 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
52661 ML->getAddressingMode(), ML->getExtensionType());
52662 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
52663 ML->getPassThru());
52664
52665 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
52666}
52667
52668 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
52669 TargetLowering::DAGCombinerInfo &DCI,
52670 const X86Subtarget &Subtarget) {
52671 auto *Mld = cast<MaskedLoadSDNode>(N);
52672
52673 // TODO: Expanding load with constant mask may be optimized as well.
52674 if (Mld->isExpandingLoad())
52675 return SDValue();
52676
52677 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
52678 if (SDValue ScalarLoad =
52679 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
52680 return ScalarLoad;
52681
52682 // TODO: Do some AVX512 subsets benefit from this transform?
52683 if (!Subtarget.hasAVX512())
52684 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
52685 return Blend;
52686 }
52687
52688 // If the mask value has been legalized to a non-boolean vector, try to
52689 // simplify ops leading up to it. We only demand the MSB of each lane.
52690 SDValue Mask = Mld->getMask();
52691 if (Mask.getScalarValueSizeInBits() != 1) {
52692 EVT VT = Mld->getValueType(0);
52693 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52694 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
52695 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
52696 if (N->getOpcode() != ISD::DELETED_NODE)
52697 DCI.AddToWorklist(N);
52698 return SDValue(N, 0);
52699 }
52700 if (SDValue NewMask =
52701 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
52702 return DAG.getMaskedLoad(
52703 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
52704 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
52705 Mld->getAddressingMode(), Mld->getExtensionType());
52706 }
52707
52708 return SDValue();
52709}
52710
52711/// If exactly one element of the mask is set for a non-truncating masked store,
52712/// it is a vector extract and scalar store.
52713/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52714/// mask have already been optimized in IR, so we don't bother with those here.
52715 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
52716 SelectionDAG &DAG,
52717 const X86Subtarget &Subtarget) {
52718 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52719 // However, some target hooks may need to be added to know when the transform
52720 // is profitable. Endianness would also have to be considered.
52721
52722 SDValue Addr, VecIndex;
52723 Align Alignment;
52724 unsigned Offset;
52725 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
52726 return SDValue();
52727
52728 // Extract the one scalar element that is actually being stored.
52729 SDLoc DL(MS);
52730 SDValue Value = MS->getValue();
52731 EVT VT = Value.getValueType();
52732 EVT EltVT = VT.getVectorElementType();
52733 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
52734 EltVT = MVT::f64;
52735 EVT CastVT = VT.changeVectorElementType(EltVT);
52736 Value = DAG.getBitcast(CastVT, Value);
52737 }
52738 SDValue Extract =
52739 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
52740
52741 // Store that element at the appropriate offset from the base pointer.
52742 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
52743 MS->getPointerInfo().getWithOffset(Offset),
52744 Alignment, MS->getMemOperand()->getFlags());
52745}
52746
52747 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
52748 TargetLowering::DAGCombinerInfo &DCI,
52749 const X86Subtarget &Subtarget) {
52750 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
52751 if (Mst->isCompressingStore())
52752 return SDValue();
52753
52754 EVT VT = Mst->getValue().getValueType();
52755 SDLoc dl(Mst);
52756 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52757
52758 if (Mst->isTruncatingStore())
52759 return SDValue();
52760
52761 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
52762 return ScalarStore;
52763
52764 // If the mask value has been legalized to a non-boolean vector, try to
52765 // simplify ops leading up to it. We only demand the MSB of each lane.
52766 SDValue Mask = Mst->getMask();
52767 if (Mask.getScalarValueSizeInBits() != 1) {
52768 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
52769 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
52770 if (N->getOpcode() != ISD::DELETED_NODE)
52771 DCI.AddToWorklist(N);
52772 return SDValue(N, 0);
52773 }
52774 if (SDValue NewMask =
52775 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
52776 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
52777 Mst->getBasePtr(), Mst->getOffset(), NewMask,
52778 Mst->getMemoryVT(), Mst->getMemOperand(),
52779 Mst->getAddressingMode());
52780 }
52781
52782 SDValue Value = Mst->getValue();
52783 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
52784 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
52785 Mst->getMemoryVT())) {
52786 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
52787 Mst->getBasePtr(), Mst->getOffset(), Mask,
52788 Mst->getMemoryVT(), Mst->getMemOperand(),
52789 Mst->getAddressingMode(), true);
52790 }
52791
52792 return SDValue();
52793}
52794
52795 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
52796 TargetLowering::DAGCombinerInfo &DCI,
52797 const X86Subtarget &Subtarget) {
52798 StoreSDNode *St = cast<StoreSDNode>(N);
52799 EVT StVT = St->getMemoryVT();
52800 SDLoc dl(St);
52801 SDValue StoredVal = St->getValue();
52802 EVT VT = StoredVal.getValueType();
52803 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52804
52805 // Convert a store of vXi1 into a store of iX and a bitcast.
52806 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
52807 VT.getVectorElementType() == MVT::i1) {
52808
52809 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
52810 StoredVal = DAG.getBitcast(NewVT, StoredVal);
52811
52812 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52813 St->getPointerInfo(), St->getOriginalAlign(),
52814 St->getMemOperand()->getFlags());
52815 }
52816
52817 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
52818 // This will avoid a copy to k-register.
52819 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
52820 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
52821 StoredVal.getOperand(0).getValueType() == MVT::i8) {
52822 SDValue Val = StoredVal.getOperand(0);
52823 // We must store zeros to the unused bits.
52824 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
52825 return DAG.getStore(St->getChain(), dl, Val,
52826 St->getBasePtr(), St->getPointerInfo(),
52827 St->getOriginalAlign(),
52828 St->getMemOperand()->getFlags());
52829 }
52830
52831 // Widen v2i1/v4i1 stores to v8i1.
52832 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
52833 Subtarget.hasAVX512()) {
52834 unsigned NumConcats = 8 / VT.getVectorNumElements();
52835 // We must store zeros to the unused bits.
52836 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
52837 Ops[0] = StoredVal;
52838 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
52839 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52840 St->getPointerInfo(), St->getOriginalAlign(),
52841 St->getMemOperand()->getFlags());
52842 }
52843
52844 // Turn vXi1 stores of constants into a scalar store.
52845 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
52846 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
52847 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
52848 // If it's a v64i1 store without 64-bit support, we need two stores.
52849 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
52850 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
52851 StoredVal->ops().slice(0, 32));
52852 Lo = combinevXi1ConstantToInteger(Lo, DAG);
52853 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
52854 StoredVal->ops().slice(32, 32));
52855 Hi = combinevXi1ConstantToInteger(Hi, DAG);
52856
52857 SDValue Ptr0 = St->getBasePtr();
52858 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
52859
52860 SDValue Ch0 =
52861 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
52862 St->getOriginalAlign(),
52863 St->getMemOperand()->getFlags());
52864 SDValue Ch1 =
52865 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
52866 St->getPointerInfo().getWithOffset(4),
52867 St->getOriginalAlign(),
52868 St->getMemOperand()->getFlags());
52869 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
52870 }
52871
52872 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
52873 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52874 St->getPointerInfo(), St->getOriginalAlign(),
52875 St->getMemOperand()->getFlags());
52876 }
52877
52878 // Convert scalar fabs/fneg load-store to integer equivalents.
52879 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
52880 (StoredVal.getOpcode() == ISD::FABS ||
52881 StoredVal.getOpcode() == ISD::FNEG) &&
52882 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
52883 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
52884 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
52885 if (TLI.isTypeLegal(IntVT)) {
52886 APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
52887 unsigned SignOp = ISD::XOR;
52888 if (StoredVal.getOpcode() == ISD::FABS) {
52889 SignMask = ~SignMask;
52890 SignOp = ISD::AND;
52891 }
52892 SDValue LogicOp = DAG.getNode(
52893 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
52894 DAG.getConstant(SignMask, dl, IntVT));
52895 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
52896 St->getPointerInfo(), St->getOriginalAlign(),
52897 St->getMemOperand()->getFlags());
52898 }
52899 }
52900
52901 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
52902 // Sandy Bridge, perform two 16-byte stores.
52903 unsigned Fast;
52904 if (VT.is256BitVector() && StVT == VT &&
52905 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
52906 *St->getMemOperand(), &Fast) &&
52907 !Fast) {
52908 unsigned NumElems = VT.getVectorNumElements();
52909 if (NumElems < 2)
52910 return SDValue();
52911
52912 return splitVectorStore(St, DAG);
52913 }
52914
52915 // Split under-aligned vector non-temporal stores.
52916 if (St->isNonTemporal() && StVT == VT &&
52917 St->getAlign().value() < VT.getStoreSize()) {
52918 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
52919 // vectors or the legalizer can scalarize it to use MOVNTI.
52920 if (VT.is256BitVector() || VT.is512BitVector()) {
52921 unsigned NumElems = VT.getVectorNumElements();
52922 if (NumElems < 2)
52923 return SDValue();
52924 return splitVectorStore(St, DAG);
52925 }
52926
52927 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
52928 // to use MOVNTI.
52929 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
52930 MVT NTVT = Subtarget.hasSSE4A()
52931 ? MVT::v2f64
52932 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
52933 return scalarizeVectorStore(St, NTVT, DAG);
52934 }
52935 }
52936
52937 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
52938 // supported, but avx512f is by extending to v16i32 and truncating.
52939 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
52940 St->getValue().getOpcode() == ISD::TRUNCATE &&
52941 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
52942 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
52943 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
52944 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
52945 St->getValue().getOperand(0));
52946 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
52947 MVT::v16i8, St->getMemOperand());
52948 }
52949
52950 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
52951 if (!St->isTruncatingStore() &&
52952 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
52953 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
52954 StoredVal.hasOneUse() &&
52955 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
52956 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
52957 return EmitTruncSStore(IsSigned, St->getChain(),
52958 dl, StoredVal.getOperand(0), St->getBasePtr(),
52959 VT, St->getMemOperand(), DAG);
52960 }
52961
52962 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
52963 if (!St->isTruncatingStore()) {
52964 auto IsExtractedElement = [](SDValue V) {
52965 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
52966 V = V.getOperand(0);
52967 unsigned Opc = V.getOpcode();
52968 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
52969 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
52970 V.getOperand(0).hasOneUse())
52971 return V.getOperand(0);
52972 return SDValue();
52973 };
52974 if (SDValue Extract = IsExtractedElement(StoredVal)) {
52975 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
52976 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
52977 SDValue Src = Trunc.getOperand(0);
52978 MVT DstVT = Trunc.getSimpleValueType();
52979 MVT SrcVT = Src.getSimpleValueType();
52980 unsigned NumSrcElts = SrcVT.getVectorNumElements();
52981 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
52982 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
52983 if (NumTruncBits == VT.getSizeInBits() &&
52984 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
52985 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
52986 TruncVT, St->getMemOperand());
52987 }
52988 }
52989 }
52990 }
52991
52992 // Optimize trunc store (of multiple scalars) to shuffle and store.
52993 // First, pack all of the elements in one place. Next, store to memory
52994 // in fewer chunks.
52995 if (St->isTruncatingStore() && VT.isVector()) {
52996 if (TLI.isTruncStoreLegal(VT, StVT)) {
52997 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
52998 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
52999 dl, Val, St->getBasePtr(),
53000 St->getMemoryVT(), St->getMemOperand(), DAG);
53001 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53002 DAG, dl))
53003 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53004 dl, Val, St->getBasePtr(),
53005 St->getMemoryVT(), St->getMemOperand(), DAG);
53006 }
53007
53008 return SDValue();
53009 }
53010
53011 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53012 unsigned AddrSpace = St->getAddressSpace();
53013 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53014 AddrSpace == X86AS::PTR32_UPTR) {
53015 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53016 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53017 SDValue Cast =
53018 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53019 return DAG.getTruncStore(
53020 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53021 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
53022 St->getAAInfo());
53023 }
53024 }
53025
53026 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53027 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
53028 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53029 Subtarget.hasCF() && St->isSimple()) {
53030 SDValue Cmov;
53031 if (StoredVal.getOpcode() == X86ISD::CMOV)
53032 Cmov = StoredVal;
53033 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53034 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53035 Cmov = StoredVal.getOperand(0);
53036 else
53037 return SDValue();
53038
53039 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53040 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53041 return SDValue();
53042
53043 bool InvertCC = false;
53044 SDValue V = SDValue(Ld, 0);
53045 if (V == Cmov.getOperand(1))
53046 InvertCC = true;
53047 else if (V != Cmov.getOperand(0))
53048 return SDValue();
53049
53050 SDVTList Tys = DAG.getVTList(MVT::Other);
53051 SDValue CC = Cmov.getOperand(2);
53052 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53053 if (InvertCC)
53054 CC = DAG.getTargetConstant(
53057 dl, MVT::i8);
53058 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53059 Cmov.getOperand(3)};
53060 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53061 St->getMemOperand());
53062 }
53063
53064 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53065 // the FP state in cases where an emms may be missing.
53066 // A preferable solution to the general problem is to figure out the right
53067 // places to insert EMMS. This qualifies as a quick hack.
53068
53069 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53070 if (VT.getSizeInBits() != 64)
53071 return SDValue();
53072
53073 const Function &F = DAG.getMachineFunction().getFunction();
53074 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53075 bool F64IsLegal =
53076 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53077
53078 if (!F64IsLegal || Subtarget.is64Bit())
53079 return SDValue();
53080
53081 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53082 cast<LoadSDNode>(St->getValue())->isSimple() &&
53083 St->getChain().hasOneUse() && St->isSimple()) {
53084 auto *Ld = cast<LoadSDNode>(St->getValue());
53085
53086 if (!ISD::isNormalLoad(Ld))
53087 return SDValue();
53088
53089 // Avoid the transformation if there are multiple uses of the loaded value.
53090 if (!Ld->hasNUsesOfValue(1, 0))
53091 return SDValue();
53092
53093 SDLoc LdDL(Ld);
53094 SDLoc StDL(N);
53095 // Lower to a single movq load/store pair.
53096 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53097 Ld->getBasePtr(), Ld->getMemOperand());
53098
53099 // Make sure new load is placed in same chain order.
53100 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53101 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53102 St->getMemOperand());
53103 }
53104
53105 // This is similar to the above case, but here we handle a scalar 64-bit
53106 // integer store that is extracted from a vector on a 32-bit target.
53107 // If we have SSE2, then we can treat it like a floating-point double
53108 // to get past legalization. The execution dependencies fixup pass will
53109 // choose the optimal machine instruction for the store if this really is
53110 // an integer or v2f32 rather than an f64.
53111 if (VT == MVT::i64 &&
53112 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53113 SDValue OldExtract = St->getOperand(1);
53114 SDValue ExtOp0 = OldExtract.getOperand(0);
53115 unsigned VecSize = ExtOp0.getValueSizeInBits();
53116 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53117 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53118 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53119 BitCast, OldExtract.getOperand(1));
53120 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53121 St->getPointerInfo(), St->getOriginalAlign(),
53122 St->getMemOperand()->getFlags());
53123 }
53124
53125 return SDValue();
53126}
53127
53128 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53129 TargetLowering::DAGCombinerInfo &DCI,
53130 const X86Subtarget &Subtarget) {
53131 auto *St = cast<MemIntrinsicSDNode>(N);
53132
53133 SDValue StoredVal = N->getOperand(1);
53134 MVT VT = StoredVal.getSimpleValueType();
53135 EVT MemVT = St->getMemoryVT();
53136
53137 // Figure out which elements we demand.
53138 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53139 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53140
53141 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53142 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53143 if (N->getOpcode() != ISD::DELETED_NODE)
53144 DCI.AddToWorklist(N);
53145 return SDValue(N, 0);
53146 }
53147
53148 return SDValue();
53149}
53150
53151/// Return 'true' if this vector operation is "horizontal"
53152/// and return the operands for the horizontal operation in LHS and RHS. A
53153/// horizontal operation performs the binary operation on successive elements
53154/// of its first operand, then on successive elements of its second operand,
53155/// returning the resulting values in a vector. For example, if
53156/// A = < float a0, float a1, float a2, float a3 >
53157/// and
53158/// B = < float b0, float b1, float b2, float b3 >
53159/// then the result of doing a horizontal operation on A and B is
53160/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53161/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53162/// A horizontal-op B, for some already available A and B, and if so then LHS is
53163/// set to A, RHS to B, and the routine returns 'true'.
53164static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53165 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53166 bool IsCommutative,
53167 SmallVectorImpl<int> &PostShuffleMask,
53168 bool ForceHorizOp) {
53169 // If either operand is undef, bail out. The binop should be simplified.
53170 if (LHS.isUndef() || RHS.isUndef())
53171 return false;
53172
53173 // Look for the following pattern:
53174 // A = < float a0, float a1, float a2, float a3 >
53175 // B = < float b0, float b1, float b2, float b3 >
53176 // and
53177 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53178 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53179 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53180 // which is A horizontal-op B.
53181
53182 MVT VT = LHS.getSimpleValueType();
53183 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53184 "Unsupported vector type for horizontal add/sub");
53185 unsigned NumElts = VT.getVectorNumElements();
53186
53187 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53188 SmallVectorImpl<int> &ShuffleMask) {
53189 bool UseSubVector = false;
53190 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53191 Op.getOperand(0).getValueType().is256BitVector() &&
53192 llvm::isNullConstant(Op.getOperand(1))) {
53193 Op = Op.getOperand(0);
53194 UseSubVector = true;
53195 }
53196 SmallVector<SDValue, 2> SrcOps;
53197 SmallVector<int, 16> SrcMask, ScaledMask;
53198 SDValue BC = peekThroughBitcasts(Op);
53199 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53200 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53201 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53202 })) {
53203 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53204 if (!UseSubVector && SrcOps.size() <= 2 &&
53205 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53206 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53207 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53208 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53209 }
53210 if (UseSubVector && SrcOps.size() == 1 &&
53211 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53212 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53213 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53214 ShuffleMask.assign(Mask.begin(), Mask.end());
53215 }
53216 }
53217 };
53218
53219 // View LHS in the form
53220 // LHS = VECTOR_SHUFFLE A, B, LMask
53221 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53222 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53223 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53224 SDValue A, B;
53225 SmallVector<int, 16> LMask;
53226 GetShuffle(LHS, A, B, LMask);
53227
53228 // Likewise, view RHS in the form
53229 // RHS = VECTOR_SHUFFLE C, D, RMask
53230 SDValue C, D;
53231 SmallVector<int, 16> RMask;
53232 GetShuffle(RHS, C, D, RMask);
53233
53234 // At least one of the operands should be a vector shuffle.
53235 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53236 if (NumShuffles == 0)
53237 return false;
53238
53239 if (LMask.empty()) {
53240 A = LHS;
53241 for (unsigned i = 0; i != NumElts; ++i)
53242 LMask.push_back(i);
53243 }
53244
53245 if (RMask.empty()) {
53246 C = RHS;
53247 for (unsigned i = 0; i != NumElts; ++i)
53248 RMask.push_back(i);
53249 }
53250
53251 // If we have a unary mask, ensure the other op is set to null.
53252 if (isUndefOrInRange(LMask, 0, NumElts))
53253 B = SDValue();
53254 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53255 A = SDValue();
53256
53257 if (isUndefOrInRange(RMask, 0, NumElts))
53258 D = SDValue();
53259 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53260 C = SDValue();
53261
53262 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53263 // RHS operands and shuffle mask.
53264 if (A != C) {
53265 std::swap(C, D);
53266 ShuffleVectorSDNode::commuteMask(RMask);
53267 }
53268 // Check that the shuffles are both shuffling the same vectors.
53269 if (!(A == C && B == D))
53270 return false;
53271
53272 PostShuffleMask.clear();
53273 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53274
53275 // LHS and RHS are now:
53276 // LHS = shuffle A, B, LMask
53277 // RHS = shuffle A, B, RMask
53278 // Check that the masks correspond to performing a horizontal operation.
53279 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53280 // so we just repeat the inner loop if this is a 256-bit op.
53281 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53282 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53283 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53284 assert((NumEltsPer128BitChunk % 2 == 0) &&
53285 "Vector type should have an even number of elements in each lane");
53286 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53287 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53288 // Ignore undefined components.
53289 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53290 if (LIdx < 0 || RIdx < 0 ||
53291 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53292 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53293 continue;
53294
53295 // Check that successive odd/even elements are being operated on. If not,
53296 // this is not a horizontal operation.
53297 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53298 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53299 return false;
53300
53301 // Compute the post-shuffle mask index based on where the element
53302 // is stored in the HOP result, and where it needs to be moved to.
53303 int Base = LIdx & ~1u;
53304 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53305 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53306
53307 // The low half of the 128-bit result must choose from A.
53308 // The high half of the 128-bit result must choose from B,
53309 // unless B is undef. In that case, we are always choosing from A.
53310 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53311 Index += NumEltsPer64BitChunk;
53312 PostShuffleMask[i + j] = Index;
53313 }
53314 }
53315
53316 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53317 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53318
53319 bool IsIdentityPostShuffle =
53320 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53321 if (IsIdentityPostShuffle)
53322 PostShuffleMask.clear();
53323
53324 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53325 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53326 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53327 return false;
53328
53329 // If the source nodes are already used in HorizOps then always accept this.
53330 // Shuffle folding should merge these back together.
53331 auto FoundHorizUser = [&](SDNode *User) {
53332 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53333 };
53334 ForceHorizOp =
53335 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53336 llvm::any_of(NewRHS->users(), FoundHorizUser));
53337
53338 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53339 // shuffle the result.
53340 if (!ForceHorizOp &&
53341 !shouldUseHorizontalOp(NewLHS == NewRHS &&
53342 (NumShuffles < 2 || !IsIdentityPostShuffle),
53343 DAG, Subtarget))
53344 return false;
53345
53346 LHS = DAG.getBitcast(VT, NewLHS);
53347 RHS = DAG.getBitcast(VT, NewRHS);
53348 return true;
53349}
53350
53351// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
53352 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
53353 const X86Subtarget &Subtarget) {
53354 EVT VT = N->getValueType(0);
53355 unsigned Opcode = N->getOpcode();
53356 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
53357 SmallVector<int, 8> PostShuffleMask;
53358
53359 auto MergableHorizOp = [N](unsigned HorizOpcode) {
53360 return N->hasOneUse() &&
53361 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
53362 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
53363 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
53364 };
53365
53366 switch (Opcode) {
53367 case ISD::FADD:
53368 case ISD::FSUB:
53369 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
53370 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
53371 SDValue LHS = N->getOperand(0);
53372 SDValue RHS = N->getOperand(1);
53373 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
53374 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53375 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53376 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
53377 if (!PostShuffleMask.empty())
53378 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53379 DAG.getUNDEF(VT), PostShuffleMask);
53380 return HorizBinOp;
53381 }
53382 }
53383 break;
53384 case ISD::ADD:
53385 case ISD::SUB:
53386 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
53387 VT == MVT::v16i16 || VT == MVT::v8i32)) {
53388 SDValue LHS = N->getOperand(0);
53389 SDValue RHS = N->getOperand(1);
53390 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
53391 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53392 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53393 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
53394 ArrayRef<SDValue> Ops) {
53395 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
53396 };
53397 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
53398 {LHS, RHS}, HOpBuilder);
53399 if (!PostShuffleMask.empty())
53400 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53401 DAG.getUNDEF(VT), PostShuffleMask);
53402 return HorizBinOp;
53403 }
53404 }
53405 break;
53406 }
53407
53408 return SDValue();
53409}
53410
53411// Try to combine the following nodes
53412// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
53413// <i32 -2147483648[float -0.000000e+00]> 0
53414// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
53415// <(load 4 from constant-pool)> t0, t29
53416// [t30: v16i32 = bitcast t27]
53417// t6: v16i32 = xor t7, t27[t30]
53418// t11: v16f32 = bitcast t6
53419// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
53420// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
53421// t22: v16f32 = bitcast t7
53422// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
53423// t24: v32f16 = bitcast t23
53424 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
53425 const X86Subtarget &Subtarget) {
53426 EVT VT = N->getValueType(0);
53427 SDValue LHS = N->getOperand(0);
53428 SDValue RHS = N->getOperand(1);
53429 int CombineOpcode =
53430 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
53431 auto combineConjugation = [&](SDValue &r) {
53432 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
53433 SDValue XOR = LHS.getOperand(0);
53434 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
53435 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
53436 if (XORRHS.isConstant()) {
53437 APInt ConjugationInt32 = APInt(32, 0x80000000);
53438 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
53439 if ((XORRHS.getBitWidth() == 32 &&
53440 XORRHS.getConstant() == ConjugationInt32) ||
53441 (XORRHS.getBitWidth() == 64 &&
53442 XORRHS.getConstant() == ConjugationInt64)) {
53443 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
53444 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
53445 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
53446 r = DAG.getBitcast(VT, FCMulC);
53447 return true;
53448 }
53449 }
53450 }
53451 }
53452 return false;
53453 };
53454 SDValue Res;
53455 if (combineConjugation(Res))
53456 return Res;
53457 std::swap(LHS, RHS);
53458 if (combineConjugation(Res))
53459 return Res;
53460 return Res;
53461}
53462
53463// Try to combine the following nodes:
53464// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
53465 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
53466 const X86Subtarget &Subtarget) {
53467 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
53468 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
53469 Flags.hasAllowContract();
53470 };
53471
53472 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
53473 return DAG.getTarget().Options.NoSignedZerosFPMath ||
53474 Flags.hasNoSignedZeros();
53475 };
53476 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
53477 APInt AI = APInt(32, 0x80008000);
53478 KnownBits Bits = DAG.computeKnownBits(Op);
53479 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
53480 Bits.getConstant() == AI;
53481 };
53482
53483 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
53484 !AllowContract(N->getFlags()))
53485 return SDValue();
53486
53487 EVT VT = N->getValueType(0);
53488 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
53489 return SDValue();
53490
53491 SDValue LHS = N->getOperand(0);
53492 SDValue RHS = N->getOperand(1);
53493 bool IsConj;
53494 SDValue FAddOp1, MulOp0, MulOp1;
53495 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
53496 &IsVectorAllNegativeZero,
53497 &HasNoSignedZero](SDValue N) -> bool {
53498 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
53499 return false;
53500 SDValue Op0 = N.getOperand(0);
53501 unsigned Opcode = Op0.getOpcode();
53502 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
53503 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
53504 MulOp0 = Op0.getOperand(0);
53505 MulOp1 = Op0.getOperand(1);
53506 IsConj = Opcode == X86ISD::VFCMULC;
53507 return true;
53508 }
53509 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
53510 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
53511 HasNoSignedZero(Op0->getFlags())) ||
53512 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
53513 MulOp0 = Op0.getOperand(0);
53514 MulOp1 = Op0.getOperand(1);
53515 IsConj = Opcode == X86ISD::VFCMADDC;
53516 return true;
53517 }
53518 }
53519 return false;
53520 };
53521
53522 if (GetCFmulFrom(LHS))
53523 FAddOp1 = RHS;
53524 else if (GetCFmulFrom(RHS))
53525 FAddOp1 = LHS;
53526 else
53527 return SDValue();
53528
53529 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
53530 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
53531 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
53532 // FIXME: How do we handle when fast math flags of FADD are different from
53533 // CFMUL's?
53534 SDValue CFmul =
53535 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
53536 return DAG.getBitcast(VT, CFmul);
53537}
53538
53539/// Do target-specific dag combines on floating-point adds/subs.
53540 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
53541 const X86Subtarget &Subtarget) {
53542 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
53543 return HOp;
53544
53545 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
53546 return COp;
53547
53548 return SDValue();
53549}
53550
53551 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
53552 const X86Subtarget &Subtarget) {
53553 EVT VT = N->getValueType(0);
53554 SDValue Src = N->getOperand(0);
53555 EVT SrcVT = Src.getValueType();
53556 SDLoc DL(N);
53557
53558 if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
53559 SrcVT != MVT::v2f32)
53560 return SDValue();
53561
53562 return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
53563 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
53564 DAG.getUNDEF(SrcVT)));
53565}
53566
53567/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
53568/// the codegen.
53569/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
53570/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
53571/// anything that is guaranteed to be transformed by DAGCombiner.
53572 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
53573 const X86Subtarget &Subtarget,
53574 const SDLoc &DL) {
53575 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
53576 SDValue Src = N->getOperand(0);
53577 unsigned SrcOpcode = Src.getOpcode();
53578 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53579
53580 EVT VT = N->getValueType(0);
53581 EVT SrcVT = Src.getValueType();
53582
53583 auto IsFreeTruncation = [VT](SDValue Op) {
53584 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
53585
53586 // See if this has been extended from a smaller/equal size to
53587 // the truncation size, allowing a truncation to combine with the extend.
53588 unsigned Opcode = Op.getOpcode();
53589 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
53590 Opcode == ISD::ZERO_EXTEND) &&
53591 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
53592 return true;
53593
53594 // See if this is a single use constant which can be constant folded.
53595 // NOTE: We don't peek through bitcasts here because there is currently
53596 // no support for constant folding truncate+bitcast+vector_of_constants. So
53597 // we'll just end up with a truncate on both operands which will
53598 // get turned back into (truncate (binop)) causing an infinite loop.
53599 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
53600 };
53601
53602 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
53603 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
53604 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
53605 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
53606 };
53607
53608 // Don't combine if the operation has other uses.
53609 if (!Src.hasOneUse())
53610 return SDValue();
53611
53612 // Only support vector truncation for now.
53613 // TODO: i64 scalar math would benefit as well.
53614 if (!VT.isVector())
53615 return SDValue();
53616
53617 // In most cases it's only worth pre-truncating if we're only facing the cost
53618 // of one truncation.
53619 // i.e. if one of the inputs will constant fold or the input is repeated.
53620 switch (SrcOpcode) {
53621 case ISD::MUL:
53622 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
53623 // better to truncate if we have the chance.
53624 if (SrcVT.getScalarType() == MVT::i64 &&
53625 TLI.isOperationLegal(SrcOpcode, VT) &&
53626 !TLI.isOperationLegal(SrcOpcode, SrcVT))
53627 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
53628 [[fallthrough]];
53629 case ISD::AND:
53630 case ISD::XOR:
53631 case ISD::OR:
53632 case ISD::ADD:
53633 case ISD::SUB: {
53634 SDValue Op0 = Src.getOperand(0);
53635 SDValue Op1 = Src.getOperand(1);
53636 if (TLI.isOperationLegal(SrcOpcode, VT) &&
53637 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
53638 return TruncateArithmetic(Op0, Op1);
53639 break;
53640 }
53641 }
53642
53643 return SDValue();
53644}
53645
53646// Try to form a MULHU or MULHS node by looking for
53647// (trunc (srl (mul ext, ext), 16))
53648// TODO: This is X86 specific because we want to be able to handle wide types
53649// before type legalization. But we can only do it if the vector will be
53650// legalized via widening/splitting. Type legalization can't handle promotion
53651// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
53652// combiner.
53653static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
53654 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
53655 using namespace llvm::SDPatternMatch;
53656
53657 if (!Subtarget.hasSSE2())
53658 return SDValue();
53659
53660 // Only handle vXi16 types that are at least 128-bits unless they will be
53661 // widened.
53662 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
53663 return SDValue();
53664
53665 // Input type should be at least vXi32.
53666 EVT InVT = Src.getValueType();
53667 if (InVT.getVectorElementType().getSizeInBits() < 32)
53668 return SDValue();
53669
53670 // First instruction should be a right shift by 16 of a multiply.
53671 SDValue LHS, RHS;
53672 if (!sd_match(Src,
53673 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_SpecificInt(16))))
53674 return SDValue();
53675
53676 // Count leading sign/zero bits on both inputs - if there are enough then
53677 // truncation back to vXi16 will be cheap - either as a pack/shuffle
53678 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
53679 // truncations may actually be free by peeking through to the ext source.
53680 auto IsSext = [&DAG](SDValue V) {
53681 return DAG.ComputeMaxSignificantBits(V) <= 16;
53682 };
53683 auto IsZext = [&DAG](SDValue V) {
53684 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
53685 };
53686
53687 bool IsSigned = IsSext(LHS) && IsSext(RHS);
53688 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
53689 if (!IsSigned && !IsUnsigned)
53690 return SDValue();
53691
53692 // Check if both inputs are extensions, which will be removed by truncation.
53693 auto isOpTruncateFree = [](SDValue Op) {
53694 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
53695 Op.getOpcode() == ISD::ZERO_EXTEND)
53696 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
53697 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
53698 };
53699 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
53700
53701 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
53702 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
53703 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
53704 // will have to split anyway.
53705 unsigned InSizeInBits = InVT.getSizeInBits();
53706 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
53707 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
53708 (InSizeInBits % 16) == 0) {
53709 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53710 InVT.getSizeInBits() / 16);
53711 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
53712 DAG.getBitcast(BCVT, RHS));
53713 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
53714 }
53715
53716 // Truncate back to source type.
53717 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
53718 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
53719
53720 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
53721 return DAG.getNode(Opc, DL, VT, LHS, RHS);
53722}
53723
53724// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
53725// from one vector with signed bytes from another vector, adds together
53726// adjacent pairs of 16-bit products, and saturates the result before
53727// truncating to 16-bits.
53728//
53729// Which looks something like this:
53730// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
53731// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
53732 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
53733 const X86Subtarget &Subtarget,
53734 const SDLoc &DL) {
53735 if (!VT.isVector() || !Subtarget.hasSSSE3())
53736 return SDValue();
53737
53738 unsigned NumElems = VT.getVectorNumElements();
53739 EVT ScalarVT = VT.getVectorElementType();
53740 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
53741 return SDValue();
53742
53743 SDValue SSatVal = detectSSatPattern(In, VT);
53744 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
53745 return SDValue();
53746
53747 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
53748 // of multiplies from even/odd elements.
53749 SDValue N0 = SSatVal.getOperand(0);
53750 SDValue N1 = SSatVal.getOperand(1);
53751
53752 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
53753 return SDValue();
53754
53755 SDValue N00 = N0.getOperand(0);
53756 SDValue N01 = N0.getOperand(1);
53757 SDValue N10 = N1.getOperand(0);
53758 SDValue N11 = N1.getOperand(1);
53759
53760 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
53761 // Canonicalize zero_extend to LHS.
53762 if (N01.getOpcode() == ISD::ZERO_EXTEND)
53763 std::swap(N00, N01);
53764 if (N11.getOpcode() == ISD::ZERO_EXTEND)
53765 std::swap(N10, N11);
53766
53767 // Ensure we have a zero_extend and a sign_extend.
53768 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
53769 N01.getOpcode() != ISD::SIGN_EXTEND ||
53770 N10.getOpcode() != ISD::ZERO_EXTEND ||
53771 N11.getOpcode() != ISD::SIGN_EXTEND)
53772 return SDValue();
53773
53774 // Peek through the extends.
53775 N00 = N00.getOperand(0);
53776 N01 = N01.getOperand(0);
53777 N10 = N10.getOperand(0);
53778 N11 = N11.getOperand(0);
53779
53780 // Ensure the extend is from vXi8.
53781 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
53782 N01.getValueType().getVectorElementType() != MVT::i8 ||
53783 N10.getValueType().getVectorElementType() != MVT::i8 ||
53784 N11.getValueType().getVectorElementType() != MVT::i8)
53785 return SDValue();
53786
53787 // All inputs should be build_vectors.
53788 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
53789 N01.getOpcode() != ISD::BUILD_VECTOR ||
53790 N10.getOpcode() != ISD::BUILD_VECTOR ||
53791 N11.getOpcode() != ISD::BUILD_VECTOR)
53792 return SDValue();
53793
53794 // N00/N10 are zero extended. N01/N11 are sign extended.
53795
53796 // For each element, we need to ensure we have an odd element from one vector
53797 // multiplied by the odd element of another vector and the even element from
53798 // one of the same vectors being multiplied by the even element from the
53799 // other vector. So we need to make sure for each element i, this operator
53800 // is being performed:
53801 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
53802 SDValue ZExtIn, SExtIn;
53803 for (unsigned i = 0; i != NumElems; ++i) {
53804 SDValue N00Elt = N00.getOperand(i);
53805 SDValue N01Elt = N01.getOperand(i);
53806 SDValue N10Elt = N10.getOperand(i);
53807 SDValue N11Elt = N11.getOperand(i);
53808 // TODO: Be more tolerant to undefs.
53809 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53810 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53811 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53812 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53813 return SDValue();
53814 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
53815 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
53816 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
53817 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
53818 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
53819 return SDValue();
53820 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53821 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53822 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53823 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53824 // Add is commutative so indices can be reordered.
53825 if (IdxN00 > IdxN10) {
53826 std::swap(IdxN00, IdxN10);
53827 std::swap(IdxN01, IdxN11);
53828 }
53829 // N0 indices must be the even element. N1 indices must be the next odd element.
53830 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
53831 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
53832 return SDValue();
53833 SDValue N00In = N00Elt.getOperand(0);
53834 SDValue N01In = N01Elt.getOperand(0);
53835 SDValue N10In = N10Elt.getOperand(0);
53836 SDValue N11In = N11Elt.getOperand(0);
53837 // First time we find an input capture it.
53838 if (!ZExtIn) {
53839 ZExtIn = N00In;
53840 SExtIn = N01In;
53841 }
53842 if (ZExtIn != N00In || SExtIn != N01In ||
53843 ZExtIn != N10In || SExtIn != N11In)
53844 return SDValue();
53845 }
53846
53847 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
53848 EVT ExtVT = Ext.getValueType();
53849 if (ExtVT.getVectorNumElements() != NumElems * 2) {
53850 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
53851 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
53852 DAG.getVectorIdxConstant(0, DL));
53853 }
53854 };
53855 ExtractVec(ZExtIn);
53856 ExtractVec(SExtIn);
53857
53858 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
53859 ArrayRef<SDValue> Ops) {
53860 // Shrink by adding truncate nodes and let DAGCombine fold with the
53861 // sources.
53862 EVT InVT = Ops[0].getValueType();
53863 assert(InVT.getScalarType() == MVT::i8 &&
53864 "Unexpected scalar element type");
53865 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
53866 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53867 InVT.getVectorNumElements() / 2);
53868 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
53869 };
53870 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
53871 PMADDBuilder);
53872}
53873
53874 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
53875 const X86Subtarget &Subtarget) {
53876 EVT VT = N->getValueType(0);
53877 SDValue Src = N->getOperand(0);
53878 SDLoc DL(N);
53879
53880 // Attempt to pre-truncate inputs to arithmetic ops instead.
53881 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
53882 return V;
53883
53884 // Try to detect PMADD
53885 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
53886 return PMAdd;
53887
53888 // Try to combine truncation with signed/unsigned saturation.
53889 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
53890 return Val;
53891
53892 // Try to combine PMULHUW/PMULHW for vXi16.
53893 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
53894 return V;
53895
53896 // The bitcast source is a direct mmx result.
53897 // Detect a truncation to i32 of a bitcast from x86mmx.
53898 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
53899 SDValue BCSrc = Src.getOperand(0);
53900 if (BCSrc.getValueType() == MVT::x86mmx)
53901 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
53902 }
53903
53904 // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
53905 if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
53906 Src.hasOneUse())
53907 return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
53908
53909 return SDValue();
53910}
53911
53912 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
53913 TargetLowering::DAGCombinerInfo &DCI) {
53914 EVT VT = N->getValueType(0);
53915 SDValue In = N->getOperand(0);
53916 SDLoc DL(N);
53917
53918 if (SDValue SSatVal = detectSSatPattern(In, VT))
53919 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
53920 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
53921 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
53922
53923 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53924 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
53925 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53926 return SDValue(N, 0);
53927
53928 return SDValue();
53929}
53930
53931/// Returns the negated value if the node \p N flips sign of FP value.
53932///
53933/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53934/// or FSUB(0, x)
53935/// AVX512F does not have FXOR, so FNEG is lowered as
53936/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
53937/// In this case we go though all bitcasts.
53938/// This also recognizes splat of a negated value and returns the splat of that
53939/// value.
53940static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53941 if (N->getOpcode() == ISD::FNEG)
53942 return N->getOperand(0);
53943
53944 // Don't recurse exponentially.
53945 if (Depth > SelectionDAG::MaxRecursionDepth)
53946 return SDValue();
53947
53948 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53949
53950 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
53951 EVT VT = Op->getValueType(0);
53952
53953 // Make sure the element size doesn't change.
53954 if (VT.getScalarSizeInBits() != ScalarSize)
53955 return SDValue();
53956
53957 unsigned Opc = Op.getOpcode();
53958 switch (Opc) {
53959 case ISD::VECTOR_SHUFFLE: {
53960 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
53961 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
53962 if (!Op.getOperand(1).isUndef())
53963 return SDValue();
53964 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53965 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
53966 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
53967 cast<ShuffleVectorSDNode>(Op)->getMask());
53968 break;
53969 }
53970 case ISD::INSERT_VECTOR_ELT: {
53971 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
53972 // -V, INDEX).
53973 SDValue InsVector = Op.getOperand(0);
53974 SDValue InsVal = Op.getOperand(1);
53975 if (!InsVector.isUndef())
53976 return SDValue();
53977 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
53978 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53979 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53980 NegInsVal, Op.getOperand(2));
53981 break;
53982 }
53983 case ISD::FSUB:
53984 case ISD::XOR:
53985 case X86ISD::FXOR: {
53986 SDValue Op1 = Op.getOperand(1);
53987 SDValue Op0 = Op.getOperand(0);
53988
53989 // For XOR and FXOR, we want to check if constant
53990 // bits of Op1 are sign bit masks. For FSUB, we
53991 // have to check if constant bits of Op0 are sign
53992 // bit masks and hence we swap the operands.
53993 if (Opc == ISD::FSUB)
53994 std::swap(Op0, Op1);
53995
53996 APInt UndefElts;
53997 SmallVector<APInt, 16> EltBits;
53998 // Extract constant bits and see if they are all
53999 // sign bit masks. Ignore the undef elements.
54000 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54001 /* AllowWholeUndefs */ true,
54002 /* AllowPartialUndefs */ false)) {
54003 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54004 if (!UndefElts[I] && !EltBits[I].isSignMask())
54005 return SDValue();
54006
54007 // Only allow bitcast from correctly-sized constant.
54008 Op0 = peekThroughBitcasts(Op0);
54009 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54010 return Op0;
54011 }
54012 break;
54013 } // case
54014 } // switch
54015
54016 return SDValue();
54017}
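// For example, isFNEG recognizes
//   (v4f32 (bitcast (xor (bitcast X to v4i32), (splat 0x80000000))))
// and returns X, since XORing only the sign bit of every lane is a sign flip.
// FSUB is matched by checking that its *first* operand is the sign-bit mask,
// i.e. FSUB(-0.0, X) also yields X.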
54018
54019static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54020 bool NegRes) {
54021 if (NegMul) {
54022 switch (Opcode) {
54023 // clang-format off
54024 default: llvm_unreachable("Unexpected opcode");
54025 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54026 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54027 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54028 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54029 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54030 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54031 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54032 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54033 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54034 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54035 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54036 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54037 // clang-format on
54038 }
54039 }
54040
54041 if (NegAcc) {
54042 switch (Opcode) {
54043 // clang-format off
54044 default: llvm_unreachable("Unexpected opcode");
54045 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54046 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54047 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54048 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54049 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54050 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54051 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54052 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54053 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54054 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54055 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54056 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54057 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54058 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54059 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54060 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54061 // clang-format on
54062 }
54063 }
54064
54065 if (NegRes) {
54066 switch (Opcode) {
54067 // For accuracy reasons, we never combine fneg and fma under strict FP.
54068 // clang-format off
54069 default: llvm_unreachable("Unexpected opcode");
54070 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54071 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54072 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54073 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54074 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54075 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54076 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54077 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54078 // clang-format on
54079 }
54080 }
54081
54082 return Opcode;
54083}
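// Example: negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/true,
// /*NegRes=*/false) returns X86ISD::FNMSUB, matching the identity
// -(A*B) - C == FNMSUB(A, B, C); requesting NegRes on FNMSUB maps back to
// ISD::FMA, since -(-(A*B) - C) == A*B + C.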
54084
54085/// Do target-specific dag combines on floating point negations.
54086static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54087 TargetLowering::DAGCombinerInfo &DCI,
54088 const X86Subtarget &Subtarget) {
54089 EVT OrigVT = N->getValueType(0);
54090 SDValue Arg = isFNEG(DAG, N);
54091 if (!Arg)
54092 return SDValue();
54093
54094 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54095 EVT VT = Arg.getValueType();
54096 EVT SVT = VT.getScalarType();
54097 SDLoc DL(N);
54098
54099 // Let legalize expand this if it isn't a legal type yet.
54100 if (!TLI.isTypeLegal(VT))
54101 return SDValue();
54102
54103 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54104 // use of a constant by performing (-0 - A*B) instead.
54105 // FIXME: Check rounding control flags as well once it becomes available.
54106 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54107 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54108 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54109 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54110 Arg.getOperand(1), Zero);
54111 return DAG.getBitcast(OrigVT, NewNode);
54112 }
54113
54114 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54115 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54116 if (SDValue NegArg =
54117 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54118 return DAG.getBitcast(OrigVT, NegArg);
54119
54120 return SDValue();
54121}
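// Note: with no-signed-zeros, fneg(fmul A, B) is rewritten above as
// FNMSUB(A, B, +0.0) == -(A*B) - 0.0, folding the negation into the FMA unit
// instead of loading a sign-bit mask constant for an XOR.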
54122
54123SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54124 bool LegalOperations,
54125 bool ForCodeSize,
54126 NegatibleCost &Cost,
54127 unsigned Depth) const {
54128 // fneg patterns are removable even if they have multiple uses.
54129 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54130 Cost = NegatibleCost::Cheaper;
54131 return DAG.getBitcast(Op.getValueType(), Arg);
54132 }
54133
54134 EVT VT = Op.getValueType();
54135 EVT SVT = VT.getScalarType();
54136 unsigned Opc = Op.getOpcode();
54137 SDNodeFlags Flags = Op.getNode()->getFlags();
54138 switch (Opc) {
54139 case ISD::FMA:
54140 case X86ISD::FMSUB:
54141 case X86ISD::FNMADD:
54142 case X86ISD::FNMSUB:
54143 case X86ISD::FMADD_RND:
54144 case X86ISD::FMSUB_RND:
54145 case X86ISD::FNMADD_RND:
54146 case X86ISD::FNMSUB_RND: {
54147 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54148 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54149 !isOperationLegal(ISD::FMA, VT))
54150 break;
54151
54152 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54153 // if it may have signed zeros.
54154 if (!Flags.hasNoSignedZeros())
54155 break;
54156
54157 // This is always negatible for free but we might be able to remove some
54158 // extra operand negations as well.
54159 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54160 for (int i = 0; i != 3; ++i)
54161 NewOps[i] = getCheaperNegatedExpression(
54162 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54163
54164 bool NegA = !!NewOps[0];
54165 bool NegB = !!NewOps[1];
54166 bool NegC = !!NewOps[2];
54167 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54168
54169 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54170 : NegatibleCost::Neutral;
54171
54172 // Fill in the non-negated ops with the original values.
54173 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54174 if (!NewOps[i])
54175 NewOps[i] = Op.getOperand(i);
54176 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54177 }
54178 case X86ISD::FRCP:
54179 if (SDValue NegOp0 =
54180 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54181 ForCodeSize, Cost, Depth + 1))
54182 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54183 break;
54184 }
54185
54186 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54187 ForCodeSize, Cost, Depth);
54188}
54189
54190static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54191 const X86Subtarget &Subtarget) {
54192 MVT VT = N->getSimpleValueType(0);
54193 // If we have integer vector types available, use the integer opcodes.
54194 if (!VT.isVector() || !Subtarget.hasSSE2())
54195 return SDValue();
54196
54197 SDLoc dl(N);
54198
54199 unsigned IntBits = VT.getScalarSizeInBits();
54200 MVT IntSVT = MVT::getIntegerVT(IntBits);
54201 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
54202
54203 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54204 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54205 unsigned IntOpcode;
54206 switch (N->getOpcode()) {
54207 // clang-format off
54208 default: llvm_unreachable("Unexpected FP logic op");
54209 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54210 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54211 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54212 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54213 // clang-format on
54214 }
54215 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54216 return DAG.getBitcast(VT, IntOp);
54217}
54218
54219
54220/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54221static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
54222 if (N->getOpcode() != ISD::XOR)
54223 return SDValue();
54224
54225 SDValue LHS = N->getOperand(0);
54226 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54227 return SDValue();
54228
54229 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54230 X86::CondCode(LHS->getConstantOperandVal(0)));
54231 SDLoc DL(N);
54232 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54233}
54234
54235static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54236 const X86Subtarget &Subtarget) {
54237 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54238 "Invalid opcode for combining with CTLZ");
54239 if (Subtarget.hasFastLZCNT())
54240 return SDValue();
54241
54242 EVT VT = N->getValueType(0);
54243 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54244 (VT != MVT::i64 || !Subtarget.is64Bit()))
54245 return SDValue();
54246
54247 SDValue N0 = N->getOperand(0);
54248 SDValue N1 = N->getOperand(1);
54249
54250 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54251 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54252 return SDValue();
54253
54254 SDValue OpCTLZ;
54255 SDValue OpSizeTM1;
54256
54257 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54258 OpCTLZ = N1;
54259 OpSizeTM1 = N0;
54260 } else if (N->getOpcode() == ISD::SUB) {
54261 return SDValue();
54262 } else {
54263 OpCTLZ = N0;
54264 OpSizeTM1 = N1;
54265 }
54266
54267 if (!OpCTLZ.hasOneUse())
54268 return SDValue();
54269 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54270 if (!C)
54271 return SDValue();
54272
54273 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54274 return SDValue();
54275 EVT OpVT = VT;
54276 SDValue Op = OpCTLZ.getOperand(0);
54277 if (VT == MVT::i8) {
54278 // Zero extend to i32 since there is not an i8 bsr.
54279 OpVT = MVT::i32;
54280 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
54281 }
54282
54283 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
54284 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
54285 if (VT == MVT::i8)
54286 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
54287
54288 return Op;
54289}
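// Note: for a non-zero input, BSR returns the index of the highest set bit,
// i.e. BitWidth-1 - ctlz(x), so both (xor (ctlz_zero_undef x), BitWidth-1)
// and (sub BitWidth-1, (ctlz_zero_undef x)) collapse to a single BSR; the i8
// case is zero-extended to i32 first because there is no byte-sized BSR.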
54290
54291static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
54292 TargetLowering::DAGCombinerInfo &DCI,
54293 const X86Subtarget &Subtarget) {
54294 SDValue N0 = N->getOperand(0);
54295 SDValue N1 = N->getOperand(1);
54296 EVT VT = N->getValueType(0);
54297 SDLoc DL(N);
54298
54299 // If this is SSE1 only convert to FXOR to avoid scalarization.
54300 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
54301 return DAG.getBitcast(MVT::v4i32,
54302 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
54303 DAG.getBitcast(MVT::v4f32, N0),
54304 DAG.getBitcast(MVT::v4f32, N1)));
54305 }
54306
54307 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
54308 return Cmp;
54309
54310 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
54311 return R;
54312
54313 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
54314 return R;
54315
54316 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
54317 return R;
54318
54319 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
54320 DAG, DCI, Subtarget))
54321 return FPLogic;
54322
54323 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
54324 return R;
54325
54326 if (DCI.isBeforeLegalizeOps())
54327 return SDValue();
54328
54329 if (SDValue SetCC = foldXor1SetCC(N, DAG))
54330 return SetCC;
54331
54332 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
54333 return R;
54334
54335 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
54336 return RV;
54337
54338 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
54339 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54340 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
54341 N0.getOperand(0).getValueType().isVector() &&
54342 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54343 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
54344 return DAG.getBitcast(
54345 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
54346 }
54347
54348 // Handle AVX512 mask widening.
54349 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
54350 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
54351 VT.getVectorElementType() == MVT::i1 &&
54352 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
54353 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
54354 return DAG.getNode(
54355 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
54356 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
54357 N0.getOperand(2));
54358 }
54359
54360 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
54361 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
54362 // TODO: Under what circumstances could this be performed in DAGCombine?
54363 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
54364 N0.getOperand(0).getOpcode() == N->getOpcode()) {
54365 SDValue TruncExtSrc = N0.getOperand(0);
54366 auto *N1C = dyn_cast<ConstantSDNode>(N1);
54367 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
54368 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
54369 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
54370 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
54371 return DAG.getNode(ISD::XOR, DL, VT, LHS,
54372 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
54373 }
54374 }
54375
54376 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
54377 return R;
54378
54379 return combineFneg(N, DAG, DCI, Subtarget);
54380}
54381
54382static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
54383 TargetLowering::DAGCombinerInfo &DCI,
54384 const X86Subtarget &Subtarget) {
54385 SDValue N0 = N->getOperand(0);
54386 EVT VT = N->getValueType(0);
54387
54388 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
54389 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
54390 SDValue Src = N0.getOperand(0);
54391 EVT SrcVT = Src.getValueType();
54392 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
54393 (DCI.isBeforeLegalize() ||
54394 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
54395 Subtarget.hasSSSE3()) {
54396 unsigned NumElts = SrcVT.getVectorNumElements();
54397 SmallVector<int, 32> ReverseMask(NumElts);
54398 for (unsigned I = 0; I != NumElts; ++I)
54399 ReverseMask[I] = (NumElts - 1) - I;
54400 SDValue Rev =
54401 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
54402 return DAG.getBitcast(VT, Rev);
54403 }
54404 }
54405
54406 return SDValue();
54407}
54408
54409// Various combines to try to convert to avgceilu.
54410static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
54411 TargetLowering::DAGCombinerInfo &DCI,
54412 const X86Subtarget &Subtarget) {
54413 unsigned Opcode = N->getOpcode();
54414 SDValue N0 = N->getOperand(0);
54415 SDValue N1 = N->getOperand(1);
54416 EVT VT = N->getValueType(0);
54417 EVT SVT = VT.getScalarType();
54418 SDLoc DL(N);
54419
54420 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
54421 // Only useful on vXi8 which doesn't have good SRA handling.
54422 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
54423 APInt SignBit = APInt::getSignMask(8);
54424 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
54425 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
54426 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
54427 return DAG.getNode(ISD::XOR, DL, VT,
54428 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
54429 }
54430
54431 return SDValue();
54432}
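// For reference, the scalar identity used above, as an illustrative sketch
// (assuming an avgceilu(a, b) helper that computes (a + b + 1) / 2 without
// intermediate overflow, as PAVGB does per lane):
//   uint8_t flip(uint8_t v) { return v ^ 0x80; }
//   int8_t avgceils(int8_t x, int8_t y) {
//     return (int8_t)flip(avgceilu(flip((uint8_t)x), flip((uint8_t)y)));
//   }
// Flipping the sign bit maps the signed range [-128,127] onto [0,255], so the
// unsigned rounding-up average computes the signed one.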
54433
54434static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
54435 TargetLowering::DAGCombinerInfo &DCI,
54436 const X86Subtarget &Subtarget) {
54437 EVT VT = N->getValueType(0);
54438 unsigned NumBits = VT.getSizeInBits();
54439
54440 // TODO - Constant Folding.
54441
54442 // Simplify the inputs.
54443 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54444 APInt DemandedMask(APInt::getAllOnes(NumBits));
54445 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54446 return SDValue(N, 0);
54447
54448 return SDValue();
54449}
54450
54451static bool isNullFPScalarOrVectorConst(SDValue V) {
54452 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
54453}
54454
54455/// If a value is a scalar FP zero or a vector FP zero (potentially including
54456/// undefined elements), return a zero constant that may be used to fold away
54457/// that value. In the case of a vector, the returned constant will not contain
54458/// undefined elements even if the input parameter does. This makes it suitable
54459/// to be used as a replacement operand with operations (eg, bitwise-and) where
54460/// an undef should not propagate.
54461static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
54462 const X86Subtarget &Subtarget) {
54463 if (!isNullFPScalarOrVectorConst(V))
54464 return SDValue();
54465
54466 if (V.getValueType().isVector())
54467 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
54468
54469 return V;
54470}
54471
54472static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
54473 const X86Subtarget &Subtarget) {
54474 SDValue N0 = N->getOperand(0);
54475 SDValue N1 = N->getOperand(1);
54476 EVT VT = N->getValueType(0);
54477 SDLoc DL(N);
54478
54479 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
54480 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
54481 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
54482 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
54483 return SDValue();
54484
54485 auto isAllOnesConstantFP = [](SDValue V) {
54486 if (V.getSimpleValueType().isVector())
54487 return ISD::isBuildVectorAllOnes(V.getNode());
54488 auto *C = dyn_cast<ConstantFPSDNode>(V);
54489 return C && C->getConstantFPValue()->isAllOnesValue();
54490 };
54491
54492 // fand (fxor X, -1), Y --> fandn X, Y
54493 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
54494 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
54495
54496 // fand X, (fxor Y, -1) --> fandn Y, X
54497 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
54498 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
54499
54500 return SDValue();
54501}
54502
54503/// Do target-specific dag combines on X86ISD::FAND nodes.
54504static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
54505 const X86Subtarget &Subtarget) {
54506 // FAND(0.0, x) -> 0.0
54507 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
54508 return V;
54509
54510 // FAND(x, 0.0) -> 0.0
54511 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54512 return V;
54513
54514 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
54515 return V;
54516
54517 return lowerX86FPLogicOp(N, DAG, Subtarget);
54518}
54519
54520/// Do target-specific dag combines on X86ISD::FANDN nodes.
54521static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
54522 const X86Subtarget &Subtarget) {
54523 // FANDN(0.0, x) -> x
54524 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54525 return N->getOperand(1);
54526
54527 // FANDN(x, 0.0) -> 0.0
54528 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54529 return V;
54530
54531 return lowerX86FPLogicOp(N, DAG, Subtarget);
54532}
54533
54534/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
54535static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
54536 TargetLowering::DAGCombinerInfo &DCI,
54537 const X86Subtarget &Subtarget) {
54538 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
54539
54540 // F[X]OR(0.0, x) -> x
54541 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54542 return N->getOperand(1);
54543
54544 // F[X]OR(x, 0.0) -> x
54545 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
54546 return N->getOperand(0);
54547
54548 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
54549 return NewVal;
54550
54551 return lowerX86FPLogicOp(N, DAG, Subtarget);
54552}
54553
54554/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
54555static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
54556 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
54557
54558 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
54559 if (!DAG.getTarget().Options.NoNaNsFPMath ||
54560 !DAG.getTarget().Options.NoSignedZerosFPMath)
54561 return SDValue();
54562
54563 // If we can ignore NaNs and signed zeros, convert the FMAX and FMIN nodes
54564 // into FMINC and FMAXC, which are commutative operations.
54565 unsigned NewOp = 0;
54566 switch (N->getOpcode()) {
54567 default: llvm_unreachable("unknown opcode");
54568 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
54569 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
54570 }
54571
54572 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
54573 N->getOperand(0), N->getOperand(1));
54574}
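// Note: FMINC/FMAXC are the commutative forms that may return either operand
// for NaN or +0.0/-0.0 inputs, which is why the transform above requires both
// no-NaNs and no-signed-zeros.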
54575
54576static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
54577 const X86Subtarget &Subtarget) {
54578 EVT VT = N->getValueType(0);
54579 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
54580 return SDValue();
54581
54582 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54583
54584 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
54585 (Subtarget.hasSSE2() && VT == MVT::f64) ||
54586 (Subtarget.hasFP16() && VT == MVT::f16) ||
54587 (VT.isVector() && TLI.isTypeLegal(VT))))
54588 return SDValue();
54589
54590 SDValue Op0 = N->getOperand(0);
54591 SDValue Op1 = N->getOperand(1);
54592 SDLoc DL(N);
54593 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
54594
54595 // If we don't have to respect NaN inputs, this is a direct translation to x86
54596 // min/max instructions.
54597 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
54598 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54599
54600 // If one of the operands is known non-NaN use the native min/max instructions
54601 // with the non-NaN input as second operand.
54602 if (DAG.isKnownNeverNaN(Op1))
54603 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54604 if (DAG.isKnownNeverNaN(Op0))
54605 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
54606
54607 // If we have to respect NaN inputs, this takes at least 3 instructions.
54608 // Favor a library call when operating on a scalar and minimizing code size.
54609 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
54610 return SDValue();
54611
54612 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
54613 VT);
54614
54615 // There are 4 possibilities involving NaN inputs, and these are the required
54616 // outputs:
54617 // Op1
54618 // Num NaN
54619 // ----------------
54620 // Num | Max | Op0 |
54621 // Op0 ----------------
54622 // NaN | Op1 | NaN |
54623 // ----------------
54624 //
54625 // The SSE FP max/min instructions were not designed for this case, but rather
54626 // to implement:
54627 // Min = Op1 < Op0 ? Op1 : Op0
54628 // Max = Op1 > Op0 ? Op1 : Op0
54629 //
54630 // So they always return Op0 if either input is a NaN. However, we can still
54631 // use those instructions for fmaxnum by selecting away a NaN input.
54632
54633 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
54634 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
54635 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
54636
54637 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
54638 // are NaN, the NaN value of Op1 is the result.
54639 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
54640}
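// In pseudo-code, the sequence built above for fmaxnum(Op0, Op1) is:
//   Tmp = FMAX(Op1, Op0)          // x86 max: returns Op0 if either is NaN
//   M   = setcc(Op0, Op0, SETUO)  // true iff Op0 is NaN
//   Res = M ? Op1 : Tmp
// so a NaN in Op0 selects Op1, a NaN in Op1 falls through to Tmp == Op0, and
// if both are NaN the result is Op1's NaN, matching the table above.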
54641
54642static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
54643 TargetLowering::DAGCombinerInfo &DCI) {
54644 EVT VT = N->getValueType(0);
54645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54646
54647 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54648 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54649 return SDValue(N, 0);
54650
54651 // Convert a full vector load into vzload when not all bits are needed.
54652 SDValue In = N->getOperand(0);
54653 MVT InVT = In.getSimpleValueType();
54654 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
54655 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
54656 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54657 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
54658 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
54659 MVT MemVT = MVT::getIntegerVT(NumBits);
54660 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
54661 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
54662 SDLoc dl(N);
54663 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
54664 DAG.getBitcast(InVT, VZLoad));
54665 DCI.CombineTo(N, Convert);
54666 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54667 DCI.recursivelyDeleteUnusedNodes(LN);
54668 return SDValue(N, 0);
54669 }
54670 }
54671
54672 return SDValue();
54673}
54674
54675static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
54676 TargetLowering::DAGCombinerInfo &DCI) {
54677 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
54678 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
54679 EVT VT = N->getValueType(0);
54680
54681 // Convert a full vector load into vzload when not all bits are needed.
54682 SDValue In = N->getOperand(IsStrict ? 1 : 0);
54683 MVT InVT = In.getSimpleValueType();
54684 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
54685 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
54686 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54687 LoadSDNode *LN = cast<LoadSDNode>(In);
54688 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
54689 MVT MemVT = MVT::getFloatingPointVT(NumBits);
54690 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
54691 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
54692 SDLoc dl(N);
54693 if (IsStrict) {
54694 SDValue Convert =
54695 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
54696 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
54697 DCI.CombineTo(N, Convert, Convert.getValue(1));
54698 } else {
54699 SDValue Convert =
54700 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
54701 DCI.CombineTo(N, Convert);
54702 }
54703 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54704 DCI.recursivelyDeleteUnusedNodes(LN);
54705 return SDValue(N, 0);
54706 }
54707 }
54708
54709 return SDValue();
54710}
54711
54712/// Do target-specific dag combines on X86ISD::ANDNP nodes.
54713static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
54714 TargetLowering::DAGCombinerInfo &DCI,
54715 const X86Subtarget &Subtarget) {
54716 SDValue N0 = N->getOperand(0);
54717 SDValue N1 = N->getOperand(1);
54718 MVT VT = N->getSimpleValueType(0);
54719 int NumElts = VT.getVectorNumElements();
54720 unsigned EltSizeInBits = VT.getScalarSizeInBits();
54721 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54722 SDLoc DL(N);
54723
54724 // ANDNP(undef, x) -> 0
54725 // ANDNP(x, undef) -> 0
54726 if (N0.isUndef() || N1.isUndef())
54727 return DAG.getConstant(0, DL, VT);
54728
54729 // ANDNP(0, x) -> x
54730 if (ISD::isBuildVectorAllZeros(N0.getNode()))
54731 return N1;
54732
54733 // ANDNP(x, 0) -> 0
54734 if (ISD::isBuildVectorAllZeros(N1.getNode()))
54735 return DAG.getConstant(0, DL, VT);
54736
54737 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
54738 if (ISD::isBuildVectorAllOnes(N1.getNode()))
54739 return DAG.getNOT(DL, N0, VT);
54740
54741 // Turn ANDNP back to AND if input is inverted.
54742 if (SDValue Not = IsNOT(N0, DAG))
54743 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
54744
54745 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
54746 // to make use of predicated selects.
54747 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
54748 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
54749 SDValue Src = N0.getOperand(0);
54750 EVT SrcVT = Src.getValueType();
54751 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
54752 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
54753 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
54754 getZeroVector(VT, Subtarget, DAG, DL));
54755 }
54756
54757 // Constant Folding
54758 APInt Undefs0, Undefs1;
54759 SmallVector<APInt> EltBits0, EltBits1;
54760 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
54761 /*AllowWholeUndefs*/ true,
54762 /*AllowPartialUndefs*/ true)) {
54763 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
54764 /*AllowWholeUndefs*/ true,
54765 /*AllowPartialUndefs*/ true)) {
54766 SmallVector<APInt> ResultBits;
54767 for (int I = 0; I != NumElts; ++I)
54768 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
54769 return getConstVector(ResultBits, VT, DAG, DL);
54770 }
54771
54772 // Constant fold NOT(N0) to allow us to use AND.
54773 // Ensure this is only performed if we can confirm that the bitcasted source
54774 // has one use to prevent an infinite loop with canonicalizeBitSelect.
54775 if (N0->hasOneUse()) {
54776 SDValue BC0 = peekThroughOneUseBitcasts(N0);
54777 if (BC0.getOpcode() != ISD::BITCAST) {
54778 for (APInt &Elt : EltBits0)
54779 Elt = ~Elt;
54780 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
54781 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
54782 }
54783 }
54784 }
54785
54786 // Attempt to recursively combine a bitmask ANDNP with shuffles.
54787 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
54788 SDValue Op(N, 0);
54789 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54790 return Res;
54791
54792 // If either operand is a constant mask, then only the elements that aren't
54793 // zero are actually demanded by the other operand.
54794 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
54795 APInt UndefElts;
54796 SmallVector<APInt> EltBits;
54797 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
54798 APInt DemandedElts = APInt::getAllOnes(NumElts);
54799 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
54800 EltBits)) {
54801 DemandedBits.clearAllBits();
54802 DemandedElts.clearAllBits();
54803 for (int I = 0; I != NumElts; ++I) {
54804 if (UndefElts[I]) {
54805 // We can't assume an undef src element gives an undef dst - the
54806 // other src might be zero.
54807 DemandedBits.setAllBits();
54808 DemandedElts.setBit(I);
54809 } else if ((Invert && !EltBits[I].isAllOnes()) ||
54810 (!Invert && !EltBits[I].isZero())) {
54811 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
54812 DemandedElts.setBit(I);
54813 }
54814 }
54815 }
54816 return std::make_pair(DemandedBits, DemandedElts);
54817 };
54818 APInt Bits0, Elts0;
54819 APInt Bits1, Elts1;
54820 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
54821 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
54822
54823 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
54824 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
54825 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
54826 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
54827 if (N->getOpcode() != ISD::DELETED_NODE)
54828 DCI.AddToWorklist(N);
54829 return SDValue(N, 0);
54830 }
54831 }
54832
54833 // Folds for better commutativity:
54834 if (N1->hasOneUse()) {
54835 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
54836 if (SDValue Not = IsNOT(N1, DAG))
54837 return DAG.getNOT(
54838 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
54839
54840 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
54841 // Zero out elements by setting the PSHUFB mask value to 0xFF.
54842 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
54843 SDValue BC1 = peekThroughOneUseBitcasts(N1);
54844 if (BC1.getOpcode() == X86ISD::PSHUFB) {
54845 EVT ShufVT = BC1.getValueType();
54846 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
54847 DAG.getBitcast(ShufVT, N0));
54848 SDValue NewShuf =
54849 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
54850 return DAG.getBitcast(VT, NewShuf);
54851 }
54852 }
54853 }
54854
54855 return SDValue();
54856}
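// Note on the PSHUFB fold above: it requires every lane of N0 to be all-ones
// or all-zeros (ComputeNumSignBits == EltSizeInBits). ORing N0 into the
// shuffle mask sets the mask's high bit exactly in the lanes ANDNP would
// clear, and PSHUFB already zeroes any output byte whose mask byte has bit 7
// set, so the separate ANDNP becomes redundant.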
54857
54858static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
54859 TargetLowering::DAGCombinerInfo &DCI) {
54860 SDValue N1 = N->getOperand(1);
54861
54862 // BT ignores high bits in the bit index operand.
54863 unsigned BitWidth = N1.getValueSizeInBits();
54864 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
54865 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
54866 if (N->getOpcode() != ISD::DELETED_NODE)
54867 DCI.AddToWorklist(N);
54868 return SDValue(N, 0);
54869 }
54870
54871 return SDValue();
54872}
54873
54874static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
54875 TargetLowering::DAGCombinerInfo &DCI) {
54876 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
54877 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54878
54879 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
54880 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54881 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
54882 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
54883 if (N->getOpcode() != ISD::DELETED_NODE)
54884 DCI.AddToWorklist(N);
54885 return SDValue(N, 0);
54886 }
54887
54888 // Convert a full vector load into vzload when not all bits are needed.
54889 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
54890 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
54891 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
54892 SDLoc dl(N);
54893 if (IsStrict) {
54894 SDValue Convert = DAG.getNode(
54895 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
54896 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
54897 DCI.CombineTo(N, Convert, Convert.getValue(1));
54898 } else {
54899 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
54900 DAG.getBitcast(MVT::v8i16, VZLoad));
54901 DCI.CombineTo(N, Convert);
54902 }
54903
54904 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54905 DCI.recursivelyDeleteUnusedNodes(LN);
54906 return SDValue(N, 0);
54907 }
54908 }
54909 }
54910
54911 return SDValue();
54912}
54913
54914// Try to combine sext_in_reg of a cmov of constants by extending the constants.
54915static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
54916 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54917
54918 EVT DstVT = N->getValueType(0);
54919
54920 SDValue N0 = N->getOperand(0);
54921 SDValue N1 = N->getOperand(1);
54922 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54923
54924 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
54925 return SDValue();
54926
54927 // Look through single use any_extends / truncs.
54928 SDValue IntermediateBitwidthOp;
54929 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
54930 N0.hasOneUse()) {
54931 IntermediateBitwidthOp = N0;
54932 N0 = N0.getOperand(0);
54933 }
54934
54935 // See if we have a single use cmov.
54936 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
54937 return SDValue();
54938
54939 SDValue CMovOp0 = N0.getOperand(0);
54940 SDValue CMovOp1 = N0.getOperand(1);
54941
54942 // Make sure both operands are constants.
54943 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54944 !isa<ConstantSDNode>(CMovOp1.getNode()))
54945 return SDValue();
54946
54947 SDLoc DL(N);
54948
54949 // If we looked through an any_extend/trunc above, add one to the constants.
54950 if (IntermediateBitwidthOp) {
54951 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
54952 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
54953 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
54954 }
54955
54956 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
54957 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
54958
54959 EVT CMovVT = DstVT;
54960 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
54961 if (DstVT == MVT::i16) {
54962 CMovVT = MVT::i32;
54963 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
54964 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
54965 }
54966
54967 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
54968 N0.getOperand(2), N0.getOperand(3));
54969
54970 if (CMovVT != DstVT)
54971 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
54972
54973 return CMov;
54974}
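// Example: (sext_in_reg (cmov 200, 73, cc), i8) becomes (cmov -56, 73, cc):
// the in-register sign extension is folded into the constant operands, and
// only an i16 result still needs the promote-to-i32-and-truncate step above.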
54975
54976static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
54977 const X86Subtarget &Subtarget) {
54978 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54979
54980 if (SDValue V = combineSextInRegCmov(N, DAG))
54981 return V;
54982
54983 EVT VT = N->getValueType(0);
54984 SDValue N0 = N->getOperand(0);
54985 SDValue N1 = N->getOperand(1);
54986 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54987 SDLoc dl(N);
54988
54989 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
54990 // SSE and AVX2 since there is no sign-extended shift right
54991 // operation on a vector with 64-bit elements.
54992 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
54993 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
54994 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
54995 N0.getOpcode() == ISD::SIGN_EXTEND)) {
54996 SDValue N00 = N0.getOperand(0);
54997
54998 // EXTLOAD has a better solution on AVX2,
54999 // it may be replaced with X86ISD::VSEXT node.
55000 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55001 if (!ISD::isNormalLoad(N00.getNode()))
55002 return SDValue();
55003
55004 // Attempt to promote any comparison mask ops before moving the
55005 // SIGN_EXTEND_INREG in the way.
55006 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55007 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55008
55009 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55010 SDValue Tmp =
55011 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55012 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55013 }
55014 }
55015 return SDValue();
55016}
55017
55018/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55019/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55020/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55021/// opportunities to combine math ops, use an LEA, or use a complex addressing
55022/// mode. This can eliminate extend, add, and shift instructions.
55023static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55024 const X86Subtarget &Subtarget) {
55025 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55026 Ext->getOpcode() != ISD::ZERO_EXTEND)
55027 return SDValue();
55028
55029 // TODO: This should be valid for other integer types.
55030 EVT VT = Ext->getValueType(0);
55031 if (VT != MVT::i64)
55032 return SDValue();
55033
55034 SDValue Add = Ext->getOperand(0);
55035 if (Add.getOpcode() != ISD::ADD)
55036 return SDValue();
55037
55038 SDValue AddOp0 = Add.getOperand(0);
55039 SDValue AddOp1 = Add.getOperand(1);
55040 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55041 bool NSW = Add->getFlags().hasNoSignedWrap();
55042 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55043 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55044 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55045
55046 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55047 // into the 'zext'
55048 if ((Sext && !NSW) || (!Sext && !NUW))
55049 return SDValue();
55050
55051 // Having a constant operand to the 'add' ensures that we are not increasing
55052 // the instruction count because the constant is extended for free below.
55053 // A constant operand can also become the displacement field of an LEA.
55054 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55055 if (!AddOp1C)
55056 return SDValue();
55057
55058 // Don't make the 'add' bigger if there's no hope of combining it with some
55059 // other 'add' or 'shl' instruction.
55060 // TODO: It may be profitable to generate simpler LEA instructions in place
55061 // of single 'add' instructions, but the cost model for selecting an LEA
55062 // currently has a high threshold.
55063 bool HasLEAPotential = false;
55064 for (auto *User : Ext->users()) {
55065 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55066 HasLEAPotential = true;
55067 break;
55068 }
55069 }
55070 if (!HasLEAPotential)
55071 return SDValue();
55072
55073 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55074 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55075 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55076 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55077
55078 // The wider add is guaranteed to not wrap because both operands are
55079 // sign-extended.
55080 SDNodeFlags Flags;
55081 Flags.setNoSignedWrap(NSW);
55082 Flags.setNoUnsignedWrap(NUW);
55083 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55084}
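// Example: (i64 (sext (add nsw i32 %x, 42))) becomes
// (add nsw (i64 (sext %x)), 42); the constant can then become the
// displacement of an LEA or complex addressing mode together with the
// user add/shl, and no extend remains between the two adds.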
55085
55086// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55087// operands and the result of CMOV is not used anywhere else - promote CMOV
55088// itself instead of promoting its result. This could be beneficial, because:
55089// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55090// (or more) pseudo-CMOVs only when they go one-after-another and
55091// getting rid of result extension code after CMOV will help that.
55092// 2) Promotion of constant CMOV arguments is free, hence the
55093// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55094// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
55095// promotion is also good in terms of code-size.
55096// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
55097// promotion).
55098static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55099 SDValue CMovN = Extend->getOperand(0);
55100 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55101 return SDValue();
55102
55103 EVT TargetVT = Extend->getValueType(0);
55104 unsigned ExtendOpcode = Extend->getOpcode();
55105 SDLoc DL(Extend);
55106
55107 EVT VT = CMovN.getValueType();
55108 SDValue CMovOp0 = CMovN.getOperand(0);
55109 SDValue CMovOp1 = CMovN.getOperand(1);
55110
55111 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55112 !isa<ConstantSDNode>(CMovOp1.getNode()))
55113 return SDValue();
55114
55115 // Only extend to i32 or i64.
55116 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55117 return SDValue();
55118
55119 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
55120 // are free.
55121 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55122 return SDValue();
55123
55124 // If this is a zero extend to i64, we should only extend to i32 and use a free
55125 // zero extend to finish.
55126 EVT ExtendVT = TargetVT;
55127 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55128 ExtendVT = MVT::i32;
55129
55130 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55131 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55132
55133 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55134 CMovN.getOperand(2), CMovN.getOperand(3));
55135
55136 // Finish extending if needed.
55137 if (ExtendVT != TargetVT)
55138 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55139
55140 return Res;
55141}
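// Example: (i32 (zext (i16 (cmov 7, 9, cc)))) becomes (i32 (cmov 7, 9, cc)):
// extending the constant operands is free, the extension node disappears, and
// the 32-bit CMOV encoding is no larger than the 16-bit one.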
55142
55143// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55144// result type.
55145static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55146 const X86Subtarget &Subtarget) {
55147 SDValue N0 = N->getOperand(0);
55148 EVT VT = N->getValueType(0);
55149 SDLoc dl(N);
55150
55151 // Only do this combine with AVX512 for vector extends.
55152 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55153 return SDValue();
55154
55155 // Only combine legal element types.
55156 EVT SVT = VT.getVectorElementType();
55157 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55158 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55159 return SDValue();
55160
55161 // We don't have CMPP Instruction for vxf16
55162 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55163 return SDValue();
55164 // We can only do this if the vector size is 256 bits or less.
55165 unsigned Size = VT.getSizeInBits();
55166 if (Size > 256 && Subtarget.useAVX512Regs())
55167 return SDValue();
55168
55169 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55170 // those are the only integer compares we have.
55171 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55172 if (ISD::isUnsignedIntSetCC(CC))
55173 return SDValue();
55174
55175 // Only do this combine if the extension will be fully consumed by the setcc.
55176 EVT N00VT = N0.getOperand(0).getValueType();
55177 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55178 if (Size != MatchingVecType.getSizeInBits())
55179 return SDValue();
55180
55181 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55182
55183 if (N->getOpcode() == ISD::ZERO_EXTEND)
55184 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55185
55186 return Res;
55187}
55188
55189static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55190 TargetLowering::DAGCombinerInfo &DCI,
55191 const X86Subtarget &Subtarget) {
55192 SDValue N0 = N->getOperand(0);
55193 EVT VT = N->getValueType(0);
55194 SDLoc DL(N);
55195
55196 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55197 if (!DCI.isBeforeLegalizeOps() &&
55198 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55199 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55200 N0->getOperand(1));
55201 bool ReplaceOtherUses = !N0.hasOneUse();
55202 DCI.CombineTo(N, Setcc);
55203 // Replace other uses with a truncate of the widened setcc_carry.
55204 if (ReplaceOtherUses) {
55205 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55206 N0.getValueType(), Setcc);
55207 DCI.CombineTo(N0.getNode(), Trunc);
55208 }
55209
55210 return SDValue(N, 0);
55211 }
55212
55213 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55214 return NewCMov;
55215
55216 if (!DCI.isBeforeLegalizeOps())
55217 return SDValue();
55218
55219 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55220 return V;
55221
55222 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55223 DAG, DCI, Subtarget))
55224 return V;
55225
55226 if (VT.isVector()) {
55227 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55228 return R;
55229
55230 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
55231 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55232 }
55233
55234 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55235 return NewAdd;
55236
55237 return SDValue();
55238}
55239
55240// Inverting a constant vector is profitable if it can be eliminated and the
55241// inverted vector is already present in DAG. Otherwise, it will be loaded
55242// anyway.
55243//
55244// We determine which of the values can be completely eliminated and invert it.
55245// If both are eliminable, select a vector with the first negative element.
55246static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
55247 assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
55248 "ConstantFP build vector expected");
55249 // Check if we can eliminate V. We assume that if a value is only used in
55250 // FMAs, we can eliminate it, since this function is invoked for each FMA
55251 // with this vector.
55252 auto IsNotFMA = [](SDNode *User) {
55253 return User->getOpcode() != ISD::FMA &&
55254 User->getOpcode() != ISD::STRICT_FMA;
55255 };
55256 if (llvm::any_of(V->users(), IsNotFMA))
55257 return SDValue();
55258
55259 SmallVector<SDValue, 8> Ops;
55260 EVT VT = V.getValueType();
55261 EVT EltVT = VT.getVectorElementType();
55262 for (const SDValue &Op : V->op_values()) {
55263 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55264 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55265 } else {
55266 assert(Op.isUndef());
55267 Ops.push_back(DAG.getUNDEF(EltVT));
55268 }
55269 }
55270
55271 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
55272 if (!NV)
55273 return SDValue();
55274
55275 // If an inverted version cannot be eliminated, choose it instead of the
55276 // original version.
55277 if (llvm::any_of(NV->users(), IsNotFMA))
55278 return SDValue(NV, 0);
55279
55280 // If the inverted version also can be eliminated, we have to consistently
55281 // prefer one of the values. We prefer a constant with a negative value in
55282 // the first place.
55283 // N.B. We need to skip undefs that may precede a value.
55284 for (const SDValue &Op : V->op_values()) {
55285 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55286 if (Cst->isNegative())
55287 return SDValue();
55288 break;
55289 }
55290 }
55291 return SDValue(NV, 0);
55292}
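// Example: if V == <2.0, 3.0> is used only by FMAs and <-2.0, -3.0> already
// exists in the DAG, the negated vector is returned so the FMA negation folds
// can reuse it and the original constant is never materialized.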
55293
55294static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
55295 TargetLowering::DAGCombinerInfo &DCI,
55296 const X86Subtarget &Subtarget) {
55297 SDLoc dl(N);
55298 EVT VT = N->getValueType(0);
55299 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55300 bool IsStrict = N->isTargetOpcode()
55301 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
55302 : N->isStrictFPOpcode();
55303
55304 // Let legalize expand this if it isn't a legal type yet.
55305 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55306 if (!TLI.isTypeLegal(VT))
55307 return SDValue();
55308
55309 SDValue A = N->getOperand(IsStrict ? 1 : 0);
55310 SDValue B = N->getOperand(IsStrict ? 2 : 1);
55311 SDValue C = N->getOperand(IsStrict ? 3 : 2);
55312
55313 // If the operation allows fast-math and the target does not support FMA,
55314 // split this into mul+add to avoid libcall(s).
55315 SDNodeFlags Flags = N->getFlags();
55316 if (!IsStrict && Flags.hasAllowReassociation() &&
55317 TLI.isOperationExpand(ISD::FMA, VT)) {
55318 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
55319 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
55320 }
55321
55322 EVT ScalarVT = VT.getScalarType();
55323 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
55324 !Subtarget.hasAnyFMA()) &&
55325 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
55326 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
55327 return SDValue();
55328
55329 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
55330 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
55331 bool LegalOperations = !DCI.isBeforeLegalizeOps();
55332 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
55333 CodeSize)) {
55334 V = NegV;
55335 return true;
55336 }
55337 // Look through extract_vector_elts. If it comes from an FNEG, create a
55338 // new extract from the FNEG input.
55339 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
55340 isNullConstant(V.getOperand(1))) {
55341 SDValue Vec = V.getOperand(0);
55342 if (SDValue NegV = TLI.getCheaperNegatedExpression(
55343 Vec, DAG, LegalOperations, CodeSize)) {
55344 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
55345 NegV, V.getOperand(1));
55346 return true;
55347 }
55348 }
55349 // Lookup if there is an inverted version of constant vector V in DAG.
55350 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
55351 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
55352 V = NegV;
55353 return true;
55354 }
55355 }
55356 return false;
55357 };
55358
55359 // Do not convert the passthru input of scalar intrinsics.
55360 // FIXME: We could allow negations of the lower element only.
55361 bool NegA = invertIfNegative(A);
55362 bool NegB = invertIfNegative(B);
55363 bool NegC = invertIfNegative(C);
55364
55365 if (!NegA && !NegB && !NegC)
55366 return SDValue();
55367
55368 unsigned NewOpcode =
55369 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
55370
55371 // Propagate fast-math-flags to new FMA node.
55372 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
55373 if (IsStrict) {
55374 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
55375 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
55376 {N->getOperand(0), A, B, C});
55377 } else {
55378 if (N->getNumOperands() == 4)
55379 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
55380 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
55381 }
55382}
55383
55384// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
55385// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
55386static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
55387 TargetLowering::DAGCombinerInfo &DCI) {
55388 SDLoc dl(N);
55389 EVT VT = N->getValueType(0);
55390 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55391 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
55392 bool LegalOperations = !DCI.isBeforeLegalizeOps();
55393
55394 SDValue N2 = N->getOperand(2);
55395
55396 SDValue NegN2 =
55397 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
55398 if (!NegN2)
55399 return SDValue();
55400 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
55401
55402 if (N->getNumOperands() == 4)
55403 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
55404 NegN2, N->getOperand(3));
55405 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
55406 NegN2);
55407}
55408
55409static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
55410 TargetLowering::DAGCombinerInfo &DCI,
55411 const X86Subtarget &Subtarget) {
55412 SDLoc dl(N);
55413 SDValue N0 = N->getOperand(0);
55414 EVT VT = N->getValueType(0);
55415
55416 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55417 // FIXME: Is this needed? We don't seem to have any tests for it.
55418 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
55419 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55420 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
55421 N0->getOperand(1));
55422 bool ReplaceOtherUses = !N0.hasOneUse();
55423 DCI.CombineTo(N, Setcc);
55424 // Replace other uses with a truncate of the widened setcc_carry.
55425 if (ReplaceOtherUses) {
55426 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55427 N0.getValueType(), Setcc);
55428 DCI.CombineTo(N0.getNode(), Trunc);
55429 }
55430
55431 return SDValue(N, 0);
55432 }
55433
55434 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55435 return NewCMov;
55436
55437 if (DCI.isBeforeLegalizeOps())
55438 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55439 return V;
55440
55441 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
55442 DAG, DCI, Subtarget))
55443 return V;
55444
55445 if (VT.isVector())
55446 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
55447 return R;
55448
55449 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55450 return NewAdd;
55451
55452 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
55453 return R;
55454
55455 // TODO: Combine with any target/faux shuffle.
55456 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
55457 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
55458 SDValue N00 = N0.getOperand(0);
55459 SDValue N01 = N0.getOperand(1);
55460 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
55461 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
55462 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
55463 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
55464 return concatSubVectors(N00, N01, DAG, dl);
55465 }
55466 }
55467
55468 return SDValue();
55469}
55470
55471/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
55472/// pre-promote its result type since vXi1 vectors don't get promoted
55473/// during type legalization.
55474static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
55475 SDValue RHS, ISD::CondCode CC,
55476 const SDLoc &DL, SelectionDAG &DAG,
55477 const X86Subtarget &Subtarget) {
55478 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
55479 VT.getVectorElementType() == MVT::i1 &&
55480 (OpVT.getVectorElementType() == MVT::i8 ||
55481 OpVT.getVectorElementType() == MVT::i16)) {
55482 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
55483 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
55484 }
55485 return SDValue();
55486}
55487
55488static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
55489 TargetLowering::DAGCombinerInfo &DCI,
55490 const X86Subtarget &Subtarget) {
55491 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
55492 const SDValue LHS = N->getOperand(0);
55493 const SDValue RHS = N->getOperand(1);
55494 EVT VT = N->getValueType(0);
55495 EVT OpVT = LHS.getValueType();
55496 SDLoc DL(N);
55497
55498 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
55499 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
55500 Subtarget))
55501 return V;
55502
55503 if (VT == MVT::i1) {
55504 X86::CondCode X86CC;
55505 if (SDValue V =
55506 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
55507 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
55508 }
55509
55510 if (OpVT.isScalarInteger()) {
55511 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
55512 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
55513 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
55514 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
55515 if (N0.getOperand(0) == N1)
55516 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
55517 N0.getOperand(1));
55518 if (N0.getOperand(1) == N1)
55519 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
55520 N0.getOperand(0));
55521 }
55522 return SDValue();
55523 };
55524 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
55525 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55526 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
55527 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
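      // A concrete instance of the rewrite above (illustrative): with X = 0b1100
      // and Y = 0b0110, (or X, Y) == X is false, and so is (and (not X), Y) == 0,
      // since ~X & Y = 0b0010. The and-with-zero form maps directly onto a TEST
      // instruction, which is what makes the fold profitable.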
55528
55529 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
55530 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
55531 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
55532 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
55533 if (N0.getOperand(0) == N1)
55534 return DAG.getNode(ISD::AND, DL, OpVT, N1,
55535 DAG.getNOT(DL, N0.getOperand(1), OpVT));
55536 if (N0.getOperand(1) == N1)
55537 return DAG.getNode(ISD::AND, DL, OpVT, N1,
55538 DAG.getNOT(DL, N0.getOperand(0), OpVT));
55539 }
55540 return SDValue();
55541 };
55542 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
55543 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55544 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
55545 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55546
55547 // cmpeq(trunc(x),C) --> cmpeq(x,C)
55548 // cmpne(trunc(x),C) --> cmpne(x,C)
55549 // iff x upper bits are zero.
55550 if (LHS.getOpcode() == ISD::TRUNCATE &&
55551 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
55552 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
55553 EVT SrcVT = LHS.getOperand(0).getValueType();
55554        APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
55555                                                OpVT.getScalarSizeInBits());
55556 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55557 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
55558 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
55559 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
55560 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
55561 }
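      // For example (roughly): if x is an i64 whose upper 32 bits are known to be
      // zero, then (seteq (trunc x to i32), 7) can be checked on the full i64
      // value instead, i.e. (seteq x, 7), and the truncate disappears.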
55562
55563      // With C as a power of 2 and C != 0 and C != INT_MIN:
55564      //    icmp eq Abs(X), C ->
55565      //        (icmp eq X, C) | (icmp eq X, -C)
55566      //    icmp ne Abs(X), C ->
55567      //        (icmp ne X, C) & (icmp ne X, -C)
55568      // Both of these patterns can be better optimized in
55569      // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
55570      // integers, which is checked above.
55571 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
55572 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
55573 const APInt &CInt = C->getAPIntValue();
55574 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
55575 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
55576 SDValue BaseOp = LHS.getOperand(0);
55577 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
55578 SDValue SETCC1 = DAG.getSetCC(
55579 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
55580 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
55581 SETCC0, SETCC1);
55582 }
55583 }
55584 }
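      // For instance, (seteq (abs X), 16) is expanded here to
      // (or (seteq X, 16), (seteq X, -16)), which
      // DAGCombiner::foldAndOrOfSETCC can then optimize further.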
55585 }
55586 }
55587
55588 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
55590 // Using temporaries to avoid messing up operand ordering for later
55591 // transformations if this doesn't work.
55592 SDValue Op0 = LHS;
55593 SDValue Op1 = RHS;
55594 ISD::CondCode TmpCC = CC;
55595 // Put build_vector on the right.
55596 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
55597 std::swap(Op0, Op1);
55598 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
55599 }
55600
55601 bool IsSEXT0 =
55602 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
55603 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
55604 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
55605
55606 if (IsSEXT0 && IsVZero1) {
55607 assert(VT == Op0.getOperand(0).getValueType() &&
55608 "Unexpected operand type");
55609 if (TmpCC == ISD::SETGT)
55610 return DAG.getConstant(0, DL, VT);
55611 if (TmpCC == ISD::SETLE)
55612 return DAG.getConstant(1, DL, VT);
55613 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
55614 return DAG.getNOT(DL, Op0.getOperand(0), VT);
55615
55616 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
55617 "Unexpected condition code!");
55618 return Op0.getOperand(0);
55619 }
55620 }
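  // Intuition for the block above: Op0 is a sign-extended vXi1, so every lane
  // is either 0 or -1. Compared (signed) against zero, "greater than" can
  // never hold and "less than or equal" always holds, while SETEQ/SETGE reduce
  // to the logical NOT of the original mask and SETNE/SETLT reduce to the mask
  // itself.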
55621
55622  // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
55623  // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
55624  // better to use `PCMPGT` if the result is meant to stay in a vector (and if
55625  // it's going to a mask, there are signed AVX512 comparisons).
55626 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
55627 bool CanMakeSigned = false;
55629 KnownBits CmpKnown =
55631 // If we know LHS/RHS share the same sign bit at each element we can
55632 // make this signed.
55633 // NOTE: `computeKnownBits` on a vector type aggregates common bits
55634 // across all lanes. So a pattern where the sign varies from lane to
55635 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
55636 // missed. We could get around this by demanding each lane
55637 // independently, but this isn't the most important optimization and
55638 // that may eat into compile time.
55639 CanMakeSigned =
55640 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
55641 }
55642 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
55643 SDValue LHSOut = LHS;
55644 SDValue RHSOut = RHS;
55645 ISD::CondCode NewCC = CC;
55646 switch (CC) {
55647 case ISD::SETGE:
55648 case ISD::SETUGE:
55649 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
55650 /*NSW*/ true))
55651 LHSOut = NewLHS;
55652 else if (SDValue NewRHS = incDecVectorConstant(
55653 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
55654 RHSOut = NewRHS;
55655 else
55656 break;
55657
55658 [[fallthrough]];
55659 case ISD::SETUGT:
55660 NewCC = ISD::SETGT;
55661 break;
55662
55663 case ISD::SETLE:
55664 case ISD::SETULE:
55665 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
55666 /*NSW*/ true))
55667 LHSOut = NewLHS;
55668 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
55669 /*NSW*/ true))
55670 RHSOut = NewRHS;
55671 else
55672 break;
55673
55674 [[fallthrough]];
55675 case ISD::SETULT:
55676 // Will be swapped to SETGT in LowerVSETCC*.
55677 NewCC = ISD::SETLT;
55678 break;
55679 default:
55680 break;
55681 }
55682 if (NewCC != CC) {
55683 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
55684 NewCC, DL, DAG, Subtarget))
55685 return R;
55686 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
55687 }
55688 }
55689 }
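  // Example (illustrative): (setuge X, splat(17)) is first rewritten as
  // (setugt X, splat(16)) via the constant decrement above, and then, when the
  // sign bits of both sides are known to match (or the predicate was already
  // signed), emitted as a signed SETGT so it can lower to PCMPGT.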
55690
55691 if (SDValue R =
55692 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
55693 return R;
55694
55695  // The middle end transforms:
55696  //    `(or (icmp eq X, C), (icmp eq X, C+1))`
55697  //     -> `(icmp ult (add x, -C), 2)`
55698  // Likewise inverted cases with `ugt`.
55699  //
55700  // Since x86, pre-AVX512, doesn't have unsigned vector compares, this results
55701  // in worse codegen. So, undo the middle-end transform and go back to `(or
55702  // (icmp eq), (icmp eq))` form.
55703  // Also skip AVX1 with ymm vectors, as the umin approach combines better than
55704  // the xmm approach.
55705  //
55706  // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
55707  // ne))`, as it doesn't end up saving any instructions.
55708 // TODO: We might want to do this for avx512 as well if we `sext` the result.
55709 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
55710 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
55711 !Subtarget.hasAVX512() &&
55712 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
55713 Subtarget.hasAVX2()) &&
55714 LHS.hasOneUse()) {
55715
55716 APInt CmpC;
55717 SDValue AddC = LHS.getOperand(1);
55718 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
55720 // See which form we have depending on the constant/condition.
55721 SDValue C0 = SDValue();
55722 SDValue C1 = SDValue();
55723
55724      // If we had `(add x, -1)` and can lower with `umin`, don't transform, as
55725      // we will end up generating an additional constant. Keeping the
55726      // current form has a slight latency cost, but it's probably worth saving a
55727      // constant.
55730 // Pass
55731 }
55732 // Normal Cases
55733 else if ((CC == ISD::SETULT && CmpC == 2) ||
55734 (CC == ISD::SETULE && CmpC == 1)) {
55735 // These will constant fold.
55736 C0 = DAG.getNegative(AddC, DL, OpVT);
55737 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
55738 DAG.getAllOnesConstant(DL, OpVT));
55739 }
55740 // Inverted Cases
55741 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
55742 (CC == ISD::SETUGE && (-CmpC) == 2)) {
55743 // These will constant fold.
55744 C0 = DAG.getNOT(DL, AddC, OpVT);
55745 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
55746 DAG.getAllOnesConstant(DL, OpVT));
55747 }
55748 if (C0 && C1) {
55749 SDValue NewLHS =
55750 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
55751 SDValue NewRHS =
55752 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
55753 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
55754 }
55755 }
55756 }
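  // Worked example (illustrative): for (setult (add X, splat(-5)), splat(2)),
  // i.e. X in {5, 6}, the code above rebuilds C0 = -(-5) = 5 and C1 = C0 + 1 = 6
  // and emits (or (seteq X, splat(5)), (seteq X, splat(6))), which lowers to
  // two PCMPEQs and a POR instead of an unsigned compare.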
55757
55758 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
55759 // to avoid scalarization via legalization because v4i32 is not a legal type.
55760 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
55761 LHS.getValueType() == MVT::v4f32)
55762 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
55763
55764 // X pred 0.0 --> X pred -X
55765 // If the negation of X already exists, use it in the comparison. This removes
55766 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
55767 // instructions in patterns with a 'select' node.
55769 SDVTList FNegVT = DAG.getVTList(OpVT);
55770 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
55771 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
55772 }
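  // E.g. if the DAG already contains (fneg X), then (setolt X, 0.0) is
  // rewritten here as (setolt X, (fneg X)), so no zero constant has to be
  // materialized and the select patterns for SSE MIN/MAX can still match.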
55773
55774 return SDValue();
55775}
55776
55779 const X86Subtarget &Subtarget) {
55780 SDValue Src = N->getOperand(0);
55781 MVT SrcVT = Src.getSimpleValueType();
55782 MVT VT = N->getSimpleValueType(0);
55783 unsigned NumBits = VT.getScalarSizeInBits();
55784 unsigned NumElts = SrcVT.getVectorNumElements();
55785 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
55786 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
55787
55788 // Perform constant folding.
55789 APInt UndefElts;
55790 SmallVector<APInt, 32> EltBits;
55791 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
55792 /*AllowWholeUndefs*/ true,
55793 /*AllowPartialUndefs*/ true)) {
55794 APInt Imm(32, 0);
55795 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
55796 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
55797 Imm.setBit(Idx);
55798
55799 return DAG.getConstant(Imm, SDLoc(N), VT);
55800 }
55801
55802 // Look through int->fp bitcasts that don't change the element width.
55803 unsigned EltWidth = SrcVT.getScalarSizeInBits();
55804 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
55805 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
55806 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
55807
55808 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
55809 // with scalar comparisons.
55810 if (SDValue NotSrc = IsNOT(Src, DAG)) {
55811 SDLoc DL(N);
55812 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
55813 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
55814 return DAG.getNode(ISD::XOR, DL, VT,
55815 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
55816 DAG.getConstant(NotMask, DL, VT));
55817 }
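  // For example, for a v4i32 source NumElts = 4, so NotMask = 0xF and
  // movmsk(not(X)) becomes (xor (movmsk X), 0xF): flipping every lane's sign
  // bit is the same as flipping the four collected mask bits.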
55818
55819 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
55820 // results with scalar comparisons.
55821 if (Src.getOpcode() == X86ISD::PCMPGT &&
55822 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
55823 SDLoc DL(N);
55824 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
55825 return DAG.getNode(ISD::XOR, DL, VT,
55826 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
55827 DAG.getConstant(NotMask, DL, VT));
55828 }
55829
55830 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
55831 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
55832 // iff pow2splat(c1).
55833 // Use KnownBits to determine if only a single bit is non-zero
55834 // in each element (pow2 or zero), and shift that bit to the msb.
55835 if (Src.getOpcode() == X86ISD::PCMPEQ) {
55836 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
55837 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
55838 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
55839 if (KnownLHS.countMaxPopulation() == 1 &&
55840 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
55841 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
55842 SDLoc DL(N);
55843 MVT ShiftVT = SrcVT;
55844 SDValue ShiftLHS = Src.getOperand(0);
55845 SDValue ShiftRHS = Src.getOperand(1);
55846 if (ShiftVT.getScalarType() == MVT::i8) {
55847 // vXi8 shifts - we only care about the signbit so can use PSLLW.
55848 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
55849 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
55850 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
55851 }
55852 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
55853 ShiftLHS, ShiftAmt, DAG);
55854 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
55855 ShiftRHS, ShiftAmt, DAG);
55856 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
55857 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
55858 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
55859 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
55860 }
55861 }
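  // Rough example: with c1 = splat(8) (only bit 3 can be set in each lane of a
  // v4i32), ShiftAmt is 28, so bit 3 is moved into the sign bit of each lane
  // and the xor/not sequence leaves each sign bit set exactly when the PCMPEQ
  // lane would have been all-ones, which is all MOVMSK looks at.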
55862
55863 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
55864 if (N->isOnlyUserOf(Src.getNode())) {
55865    SDValue SrcBC = peekThroughBitcasts(Src);
55866    if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
55867 APInt UndefElts;
55868 SmallVector<APInt, 32> EltBits;
55869 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
55870 UndefElts, EltBits)) {
55871 APInt Mask = APInt::getZero(NumBits);
55872 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
55873 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
55874 Mask.setBit(Idx);
55875 }
55876 SDLoc DL(N);
55877 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
55878 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
55879 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
55880 DAG.getConstant(Mask, DL, VT));
55881 }
55882 }
55883 }
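  // E.g. movmsk(and(X, C)) becomes (and (movmsk X), M), where bit i of M is
  // set iff lane i of the constant C has its sign bit set; only the sign bits
  // feed MOVMSK, so applying the logic op to the scalar mask is equivalent.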
55884
55885 // Simplify the inputs.
55886 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55887 APInt DemandedMask(APInt::getAllOnes(NumBits));
55888 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55889 return SDValue(N, 0);
55890
55891 return SDValue();
55892}
55893
55896 const X86Subtarget &Subtarget) {
55897 MVT VT = N->getSimpleValueType(0);
55898 unsigned NumBits = VT.getScalarSizeInBits();
55899
55900 // Simplify the inputs.
55901 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55902 APInt DemandedMask(APInt::getAllOnes(NumBits));
55903 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55904 return SDValue(N, 0);
55905
55906 return SDValue();
55907}
55908
55911 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
55912 SDValue Mask = MemOp->getMask();
55913
55914 // With vector masks we only demand the upper bit of the mask.
55915 if (Mask.getScalarValueSizeInBits() != 1) {
55916 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55917 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55918 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55919 if (N->getOpcode() != ISD::DELETED_NODE)
55920 DCI.AddToWorklist(N);
55921 return SDValue(N, 0);
55922 }
55923 }
55924
55925 return SDValue();
55926}
55927
55929 SDValue Index, SDValue Base, SDValue Scale,
55930 SelectionDAG &DAG) {
55931 SDLoc DL(GorS);
55932
55933 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
55934 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
55935 Gather->getMask(), Base, Index, Scale } ;
55936 return DAG.getMaskedGather(Gather->getVTList(),
55937 Gather->getMemoryVT(), DL, Ops,
55938 Gather->getMemOperand(),
55939 Gather->getIndexType(),
55940 Gather->getExtensionType());
55941 }
55942 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
55943 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
55944 Scatter->getMask(), Base, Index, Scale };
55945 return DAG.getMaskedScatter(Scatter->getVTList(),
55946 Scatter->getMemoryVT(), DL,
55947 Ops, Scatter->getMemOperand(),
55948 Scatter->getIndexType(),
55949 Scatter->isTruncatingStore());
55950}
55951
55954 SDLoc DL(N);
55955 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
55956 SDValue Index = GorS->getIndex();
55957 SDValue Base = GorS->getBasePtr();
55958 SDValue Scale = GorS->getScale();
55959 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55960
55961 if (DCI.isBeforeLegalize()) {
55962 unsigned IndexWidth = Index.getScalarValueSizeInBits();
55963
55964 // Shrink constant indices if they are larger than 32-bits.
55965 // Only do this before legalize types since v2i64 could become v2i32.
55966 // FIXME: We could check that the type is legal if we're after legalize
55967 // types, but then we would need to construct test cases where that happens.
55968    // FIXME: We could support more than just constant vectors, but we need to be
55969    // careful with costing. A truncate that can be optimized out would be fine.
55970 // Otherwise we might only want to create a truncate if it avoids a split.
55971 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
55972 if (BV->isConstant() && IndexWidth > 32 &&
55973 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55974 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55975 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55976 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55977 }
55978 }
55979
55980    // Shrink any sign/zero extend whose source is 32 bits or smaller and whose
55981    // destination is larger than 32 bits, if there are sufficient sign bits. Only
55982    // do this before legalize types to avoid creating illegal types in truncate.
55983 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
55984 Index.getOpcode() == ISD::ZERO_EXTEND) &&
55985 IndexWidth > 32 &&
55986 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
55987 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55988 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55989 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55990 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55991 }
55992 }
55993
55994 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
55995  // Try to move splat constant adders from the index operand to the base
55996  // pointer operand, taking care to multiply by the scale. We can only do
55997  // this when the index element type is the same as the pointer type.
55998  // Otherwise we need to be sure the math doesn't wrap before the scale.
55999 if (Index.getOpcode() == ISD::ADD &&
56000 Index.getValueType().getVectorElementType() == PtrVT &&
56001 isa<ConstantSDNode>(Scale)) {
56002 uint64_t ScaleAmt = Scale->getAsZExtVal();
56003 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
56004 BitVector UndefElts;
56005 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
56006 // FIXME: Allow non-constant?
56007 if (UndefElts.none()) {
56008 // Apply the scale.
56009 APInt Adder = C->getAPIntValue() * ScaleAmt;
56010 // Add it to the existing base.
56011 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56012 DAG.getConstant(Adder, DL, PtrVT));
56013 Index = Index.getOperand(0);
56014 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56015 }
56016 }
56017
56018 // It's also possible base is just a constant. In that case, just
56019 // replace it with 0 and move the displacement into the index.
56020 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
56021 isOneConstant(Scale)) {
56022 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
56023 // Combine the constant build_vector and the constant base.
56024 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
56025 Index.getOperand(1), Splat);
56026 // Add to the LHS of the original Index add.
56027 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
56028 Index.getOperand(0), Splat);
56029 Base = DAG.getConstant(0, DL, Base.getValueType());
56030 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56031 }
56032 }
56033 }
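  // Worked example (illustrative): a gather with Base = B, Scale = 4 and
  // Index = (add V, splat(3)) is rewritten above as Base = B + 12, Index = V,
  // since each lane's address B + 4 * (V[i] + 3) equals (B + 12) + 4 * V[i].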
56034
56035 if (DCI.isBeforeLegalizeOps()) {
56036 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56037
56038 // Make sure the index is either i32 or i64
56039 if (IndexWidth != 32 && IndexWidth != 64) {
56040 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56041 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
56042 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56043 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56044 }
56045 }
56046
56047 // With vector masks we only demand the upper bit of the mask.
56048 SDValue Mask = GorS->getMask();
56049 if (Mask.getScalarValueSizeInBits() != 1) {
56050 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56051 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56052 if (N->getOpcode() != ISD::DELETED_NODE)
56053 DCI.AddToWorklist(N);
56054 return SDValue(N, 0);
56055 }
56056 }
56057
56058 return SDValue();
56059}
56060
56061// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
56063 const X86Subtarget &Subtarget) {
56064 SDLoc DL(N);
56065 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
56066 SDValue EFLAGS = N->getOperand(1);
56067
56068 // Try to simplify the EFLAGS and condition code operands.
56069 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
56070 return getSETCC(CC, Flags, DL, DAG);
56071
56072 return SDValue();
56073}
56074
56075/// Optimize branch condition evaluation.
56077 const X86Subtarget &Subtarget) {
56078 SDLoc DL(N);
56079 SDValue EFLAGS = N->getOperand(3);
56080 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
56081
56082 // Try to simplify the EFLAGS and condition code operands.
56083 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
56084 // RAUW them under us.
56085 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
56086 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
56087 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
56088 N->getOperand(1), Cond, Flags);
56089 }
56090
56091 return SDValue();
56092}
56093
56094// TODO: Could we move this to DAGCombine?
56096 SelectionDAG &DAG) {
56097  // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
56098  // to optimize away the operation when it's applied to a constant.
56099 //
56100 // The general transformation is:
56101 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
56102 // AND(VECTOR_CMP(x,y), constant2)
56103 // constant2 = UNARYOP(constant)
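  // For instance (roughly), sitofp(and(vector_cmp(x,y), splat(5))) becomes
  // and(vector_cmp(x,y), bitcast(sitofp(splat(5)))): each compare lane is 0 or
  // all-ones, so the AND selects either 0 or the constant, and converting the
  // constant once up front yields the same per-lane values (sitofp(0) is +0.0,
  // whose bit pattern is all zeros).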
56104
56105 // Early exit if this isn't a vector operation, the operand of the
56106 // unary operation isn't a bitwise AND, or if the sizes of the operations
56107 // aren't the same.
56108 EVT VT = N->getValueType(0);
56109 bool IsStrict = N->isStrictFPOpcode();
56110 unsigned NumEltBits = VT.getScalarSizeInBits();
56111 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56112 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
56113 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
56114 VT.getSizeInBits() != Op0.getValueSizeInBits())
56115 return SDValue();
56116
56117 // Now check that the other operand of the AND is a constant. We could
56118 // make the transformation for non-constant splats as well, but it's unclear
56119 // that would be a benefit as it would not eliminate any operations, just
56120 // perform one more step in scalar code before moving to the vector unit.
56121 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
56122 // Bail out if the vector isn't a constant.
56123 if (!BV->isConstant())
56124 return SDValue();
56125
56126 // Everything checks out. Build up the new and improved node.
56127 SDLoc DL(N);
56128 EVT IntVT = BV->getValueType(0);
56129 // Create a new constant of the appropriate type for the transformed
56130 // DAG.
56131 SDValue SourceConst;
56132 if (IsStrict)
56133 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
56134 {N->getOperand(0), SDValue(BV, 0)});
56135 else
56136 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
56137 // The AND node needs bitcasts to/from an integer vector type around it.
56138 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
56139 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
56140 MaskConst);
56141 SDValue Res = DAG.getBitcast(VT, NewAnd);
56142 if (IsStrict)
56143 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
56144 return Res;
56145 }
56146
56147 return SDValue();
56148}
56149
56150/// If we are converting a value to floating-point, try to replace scalar
56151/// truncate of an extracted vector element with a bitcast. This tries to keep
56152/// the sequence on XMM registers rather than moving between vector and GPRs.
56154 // TODO: This is currently only used by combineSIntToFP, but it is generalized
56155 // to allow being called by any similar cast opcode.
56156 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
56157 SDValue Trunc = N->getOperand(0);
56158 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
56159 return SDValue();
56160
56161 SDValue ExtElt = Trunc.getOperand(0);
56162 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
56163 !isNullConstant(ExtElt.getOperand(1)))
56164 return SDValue();
56165
56166 EVT TruncVT = Trunc.getValueType();
56167 EVT SrcVT = ExtElt.getValueType();
56168 unsigned DestWidth = TruncVT.getSizeInBits();
56169 unsigned SrcWidth = SrcVT.getSizeInBits();
56170 if (SrcWidth % DestWidth != 0)
56171 return SDValue();
56172
56173 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
56174 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
56175 unsigned VecWidth = SrcVecVT.getSizeInBits();
56176 unsigned NumElts = VecWidth / DestWidth;
56177 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
56178 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
56179 SDLoc DL(N);
56180 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
56181 BitcastVec, ExtElt.getOperand(1));
56182 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
56183}
56184
56186 const X86Subtarget &Subtarget) {
56187 bool IsStrict = N->isStrictFPOpcode();
56188 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56189 EVT VT = N->getValueType(0);
56190 EVT InVT = Op0.getValueType();
56191
56192 // Using i16 as an intermediate type is a bad idea, unless we have HW support
56193  // for it. Therefore for type sizes equal to or smaller than 32 just go with i32.
56194 // if hasFP16 support:
56195 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
56196 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
56197 // else
56198 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
56199 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
56200 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
56201 unsigned ScalarSize = InVT.getScalarSizeInBits();
56202 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
56203 ScalarSize >= 64)
56204 return SDValue();
56205 SDLoc dl(N);
56206 EVT DstVT =
56208 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
56209 : ScalarSize < 32 ? MVT::i32
56210 : MVT::i64,
56211 InVT.getVectorNumElements());
56212 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
56213 if (IsStrict)
56214 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56215 {N->getOperand(0), P});
56216 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56217 }
56218
56219 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
56220 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
56221 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
56222 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
56223 VT.getScalarType() != MVT::f16) {
56224 SDLoc dl(N);
56225 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
56226 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
56227
56228 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
56229 if (IsStrict)
56230 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56231 {N->getOperand(0), P});
56232 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56233 }
56234
56235 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
56236 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
56237 // the optimization here.
56238 SDNodeFlags Flags = N->getFlags();
56239 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
56240 if (IsStrict)
56241 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
56242 {N->getOperand(0), Op0});
56243 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
56244 }
56245
56246 return SDValue();
56247}
56248
56251 const X86Subtarget &Subtarget) {
56252 // First try to optimize away the conversion entirely when it's
56253 // conditionally from a constant. Vectors only.
56254 bool IsStrict = N->isStrictFPOpcode();
56256 return Res;
56257
56258 // Now move on to more general possibilities.
56259 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
56260 EVT VT = N->getValueType(0);
56261 EVT InVT = Op0.getValueType();
56262
56263 // Using i16 as an intermediate type is a bad idea, unless we have HW support
56264  // for it. Therefore for type sizes equal to or smaller than 32 just go with i32.
56265 // if hasFP16 support:
56266 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
56267 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
56268 // else
56269  //   SINT_TO_FP(vXi1~31)  -> SINT_TO_FP(SEXT(vXi1~31  to vXi32))
56270 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
56271 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
56272 unsigned ScalarSize = InVT.getScalarSizeInBits();
56273 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
56274 ScalarSize >= 64)
56275 return SDValue();
56276 SDLoc dl(N);
56277 EVT DstVT =
56279 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
56280 : ScalarSize < 32 ? MVT::i32
56281 : MVT::i64,
56282 InVT.getVectorNumElements());
56283 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
56284 if (IsStrict)
56285 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56286 {N->getOperand(0), P});
56287 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56288 }
56289
56290 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
56291 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
56292 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
56293 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
56294 VT.getScalarType() != MVT::f16) {
56295 SDLoc dl(N);
56296 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
56297 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
56298 if (IsStrict)
56299 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56300 {N->getOperand(0), P});
56301 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
56302 }
56303
56304 // Without AVX512DQ we only support i64 to float scalar conversion. For both
56305 // vectors and scalars, see if we know that the upper bits are all the sign
56306 // bit, in which case we can truncate the input to i32 and convert from that.
56307 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
56308 unsigned BitWidth = InVT.getScalarSizeInBits();
56309 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
56310 if (NumSignBits >= (BitWidth - 31)) {
56311 EVT TruncVT = MVT::i32;
56312 if (InVT.isVector())
56313 TruncVT = InVT.changeVectorElementType(TruncVT);
56314 SDLoc dl(N);
56315 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
56316 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
56317 if (IsStrict)
56318 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
56319 {N->getOperand(0), Trunc});
56320 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
56321 }
56322 // If we're after legalize and the type is v2i32 we need to shuffle and
56323 // use CVTSI2P.
56324 assert(InVT == MVT::v2i64 && "Unexpected VT!");
56325 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
56326 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
56327 { 0, 2, -1, -1 });
56328 if (IsStrict)
56329 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
56330 {N->getOperand(0), Shuf});
56331 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
56332 }
56333 }
56334
56335 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
56336 // a 32-bit target where SSE doesn't support i64->FP operations.
56337 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
56338 Op0.getOpcode() == ISD::LOAD) {
56339 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
56340
56341 // This transformation is not supported if the result type is f16 or f128.
56342 if (VT == MVT::f16 || VT == MVT::f128)
56343 return SDValue();
56344
56345 // If we have AVX512DQ we can use packed conversion instructions unless
56346 // the VT is f80.
56347 if (Subtarget.hasDQI() && VT != MVT::f80)
56348 return SDValue();
56349
56350 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
56351 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
56352 std::pair<SDValue, SDValue> Tmp =
56353 Subtarget.getTargetLowering()->BuildFILD(
56354 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
56355 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
56356 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
56357 return Tmp.first;
56358 }
56359 }
56360
56361 if (IsStrict)
56362 return SDValue();
56363
56364 if (SDValue V = combineToFPTruncExtElt(N, DAG))
56365 return V;
56366
56367 return SDValue();
56368}
56369
56370// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
56372 const X86Subtarget &Subtarget) {
56373 if (!Subtarget.hasAVX10_2())
56374 return SDValue();
56375
56376 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
56377 EVT SrcVT = N->getOperand(0).getValueType();
56378 EVT DstVT = N->getValueType(0);
56379 SDLoc dl(N);
56380
56381 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
56382 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
56383
56384 // Concatenate the original v2f32 input and V2F32Value to create v4f32
56385 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
56386 N->getOperand(0), V2F32Value);
56387
56388 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
56389 if (IsSigned)
56390 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
56391
56392 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
56393 }
56394 return SDValue();
56395}
56396
56398 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
56399
56400 for (const SDNode *User : Flags->users()) {
56401    X86::CondCode CC;
56402    switch (User->getOpcode()) {
56403 default:
56404 // Be conservative.
56405 return true;
56406 case X86ISD::SETCC:
56407    case X86ISD::SETCC_CARRY:
56408      CC = (X86::CondCode)User->getConstantOperandVal(0);
56409 break;
56410 case X86ISD::BRCOND:
56411 case X86ISD::CMOV:
56412 CC = (X86::CondCode)User->getConstantOperandVal(2);
56413 break;
56414 }
56415
56416 switch (CC) {
56417 // clang-format off
56418 default: break;
56419 case X86::COND_A: case X86::COND_AE:
56420 case X86::COND_B: case X86::COND_BE:
56421 case X86::COND_O: case X86::COND_NO:
56422 case X86::COND_G: case X86::COND_GE:
56423 case X86::COND_L: case X86::COND_LE:
56424 return true;
56425 // clang-format on
56426 }
56427 }
56428
56429 return false;
56430}
56431
56432static bool onlyZeroFlagUsed(SDValue Flags) {
56433 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
56434
56435 for (const SDNode *User : Flags->users()) {
56436 unsigned CCOpNo;
56437 switch (User->getOpcode()) {
56438 default:
56439 // Be conservative.
56440 return false;
56441 case X86ISD::SETCC:
56442    case X86ISD::SETCC_CARRY:
56443      CCOpNo = 0;
56444 break;
56445 case X86ISD::BRCOND:
56446 case X86ISD::CMOV:
56447 CCOpNo = 2;
56448 break;
56449 }
56450
56451 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
56452 if (CC != X86::COND_E && CC != X86::COND_NE)
56453 return false;
56454 }
56455
56456 return true;
56457}
56458
56461 const X86Subtarget &Subtarget) {
56462 // Only handle test patterns.
56463 if (!isNullConstant(N->getOperand(1)))
56464 return SDValue();
56465
56466 // If we have a CMP of a truncated binop, see if we can make a smaller binop
56467 // and use its flags directly.
56468 // TODO: Maybe we should try promoting compares that only use the zero flag
56469 // first if we can prove the upper bits with computeKnownBits?
56470 SDLoc dl(N);
56471 SDValue Op = N->getOperand(0);
56472 EVT VT = Op.getValueType();
56473 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56474
56475 if (SDValue CMP =
56476 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
56477 return CMP;
56478
56479 // If we have a constant logical shift that's only used in a comparison
56480 // against zero turn it into an equivalent AND. This allows turning it into
56481 // a TEST instruction later.
56482 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
56483 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
56484 onlyZeroFlagUsed(SDValue(N, 0))) {
56485 unsigned BitWidth = VT.getSizeInBits();
56486 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
56487 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
56488 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
56489 APInt Mask = Op.getOpcode() == ISD::SRL
56490 ? APInt::getHighBitsSet(BitWidth, MaskBits)
56491 : APInt::getLowBitsSet(BitWidth, MaskBits);
56492 if (Mask.isSignedIntN(32)) {
56493 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
56494 DAG.getConstant(Mask, dl, VT));
56495 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56496 DAG.getConstant(0, dl, VT));
56497 }
56498 }
56499 }
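  // Example: (cmp (srl X, 8), 0) with only ZF consumed becomes
  // (cmp (and X, 0xFFFFFF00), 0), since the shifted-out low bits cannot affect
  // the zero flag; isel can then select a TEST with an immediate.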
56500
56501  // If we're extracting from an AVX512 bool vector and comparing against zero,
56502  // then try to just bitcast the vector to an integer to use TEST/BT directly.
56503 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
56504 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
56505 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
56506 SDValue Src = Op.getOperand(0);
56507 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56508 isNullConstant(Src.getOperand(1)) &&
56509 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
56510 SDValue BoolVec = Src.getOperand(0);
56511 unsigned ShAmt = 0;
56512 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
56513 ShAmt = BoolVec.getConstantOperandVal(1);
56514 BoolVec = BoolVec.getOperand(0);
56515 }
56516 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
56517 EVT VecVT = BoolVec.getValueType();
56518 unsigned BitWidth = VecVT.getVectorNumElements();
56519 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
56520 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
56521 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
56522 Op = DAG.getBitcast(BCVT, BoolVec);
56523 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
56524 DAG.getConstant(Mask, dl, BCVT));
56525 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56526 DAG.getConstant(0, dl, BCVT));
56527 }
56528 }
56529 }
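  // E.g. testing bit 0 of (kshiftr v16i1 K, 3) against zero becomes a scalar
  // test of (bitcast K to i16) against the mask 1 << 3, so isel can use TEST
  // or BT on the bitcast value instead of extracting a single i1 element.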
56530
56531 // Peek through any zero-extend if we're only testing for a zero result.
56532 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
56533 SDValue Src = Op.getOperand(0);
56534 EVT SrcVT = Src.getValueType();
56535 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
56536 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
56537 DAG.getConstant(0, dl, SrcVT));
56538 }
56539
56540 // Look for a truncate.
56541 if (Op.getOpcode() != ISD::TRUNCATE)
56542 return SDValue();
56543
56544 SDValue Trunc = Op;
56545 Op = Op.getOperand(0);
56546
56547 // See if we can compare with zero against the truncation source,
56548 // which should help using the Z flag from many ops. Only do this for
56549 // i32 truncated op to prevent partial-reg compares of promoted ops.
56550 EVT OpVT = Op.getValueType();
56551  APInt UpperBits =
56552      APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
56553 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
56554 onlyZeroFlagUsed(SDValue(N, 0))) {
56555 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56556 DAG.getConstant(0, dl, OpVT));
56557 }
56558
56559 // After this the truncate and arithmetic op must have a single use.
56560 if (!Trunc.hasOneUse() || !Op.hasOneUse())
56561 return SDValue();
56562
56563 unsigned NewOpc;
56564 switch (Op.getOpcode()) {
56565 default: return SDValue();
56566 case ISD::AND:
56567    // Skip AND with a constant. We have special handling for AND with an
56568    // immediate during isel to generate TEST instructions.
56569 if (isa<ConstantSDNode>(Op.getOperand(1)))
56570 return SDValue();
56571 NewOpc = X86ISD::AND;
56572 break;
56573 case ISD::OR: NewOpc = X86ISD::OR; break;
56574 case ISD::XOR: NewOpc = X86ISD::XOR; break;
56575 case ISD::ADD:
56576 // If the carry or overflow flag is used, we can't truncate.
56578 return SDValue();
56579 NewOpc = X86ISD::ADD;
56580 break;
56581 case ISD::SUB:
56582 // If the carry or overflow flag is used, we can't truncate.
56584 return SDValue();
56585 NewOpc = X86ISD::SUB;
56586 break;
56587 }
56588
56589 // We found an op we can narrow. Truncate its inputs.
56590 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
56591 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
56592
56593 // Use a X86 specific opcode to avoid DAG combine messing with it.
56594 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56595 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
56596
56597 // For AND, keep a CMP so that we can match the test pattern.
56598 if (NewOpc == X86ISD::AND)
56599 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56600 DAG.getConstant(0, dl, VT));
56601
56602 // Return the flags.
56603 return Op.getValue(1);
56604}
56605
56608 const X86Subtarget &ST) {
56609 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
56610 "Expected X86ISD::ADD or X86ISD::SUB");
56611
56612 SDLoc DL(N);
56613 SDValue LHS = N->getOperand(0);
56614 SDValue RHS = N->getOperand(1);
56615 MVT VT = LHS.getSimpleValueType();
56616 bool IsSub = X86ISD::SUB == N->getOpcode();
56617 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
56618
56619 if (IsSub && isOneConstant(N->getOperand(1)) && !N->hasAnyUseOfValue(0))
56620 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
56621 return CMP;
56622
56623 // If we don't use the flag result, simplify back to a generic ADD/SUB.
56624 if (!N->hasAnyUseOfValue(1)) {
56625 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
56626 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
56627 }
56628
56629 // Fold any similar generic ADD/SUB opcodes to reuse this node.
56630 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
56631 SDValue Ops[] = {N0, N1};
56632 SDVTList VTs = DAG.getVTList(N->getValueType(0));
56633 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
56634 SDValue Op(N, 0);
56635 if (Negate) {
56636 // Bail if this is only used by a user of the x86 add/sub.
56637 if (GenericAddSub->hasOneUse() &&
56638 GenericAddSub->user_begin()->isOnlyUserOf(N))
56639 return;
56640 Op = DAG.getNegative(Op, DL, VT);
56641 }
56642 DCI.CombineTo(GenericAddSub, Op);
56643 }
56644 };
56645 MatchGeneric(LHS, RHS, false);
56646 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
56647
56648 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
56649 // EFLAGS result doesn't change.
56650 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
56651 /*ZeroSecondOpOnly*/ true);
56652}
56653
56655 SDValue LHS = N->getOperand(0);
56656 SDValue RHS = N->getOperand(1);
56657 SDValue BorrowIn = N->getOperand(2);
56658
56659 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
56660 MVT VT = N->getSimpleValueType(0);
56661 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56662 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
56663 }
56664
56665 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
56666 // iff the flag result is dead.
56667 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
56668 !N->hasAnyUseOfValue(1))
56669 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56670 LHS.getOperand(1), BorrowIn);
56671
56672 return SDValue();
56673}
56674
56675// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
56678 SDValue LHS = N->getOperand(0);
56679 SDValue RHS = N->getOperand(1);
56680 SDValue CarryIn = N->getOperand(2);
56681 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
56682 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
56683
56684 // Canonicalize constant to RHS.
56685 if (LHSC && !RHSC)
56686 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
56687 CarryIn);
56688
56689 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
56690 // the result is either zero or one (depending on the input carry bit).
56691 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
56692 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
56693      // We don't have a good way to replace an EFLAGS use, so only do this when
56694      // the EFLAGS result is dead right now.
56695 SDValue(N, 1).use_empty()) {
56696 SDLoc DL(N);
56697 EVT VT = N->getValueType(0);
56698 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
56699 SDValue Res1 = DAG.getNode(
56700 ISD::AND, DL, VT,
56702 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
56703 DAG.getConstant(1, DL, VT));
56704 return DCI.CombineTo(N, Res1, CarryOut);
56705 }
56706
56707 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
56708 // iff the flag result is dead.
56709 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
56710 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
56711 SDLoc DL(N);
56712 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
56713 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
56714 DAG.getConstant(0, DL, LHS.getValueType()),
56715 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
56716 }
56717
56718 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
56719 MVT VT = N->getSimpleValueType(0);
56720 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56721 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
56722 }
56723
56724 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
56725 // iff the flag result is dead.
56726 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
56727 !N->hasAnyUseOfValue(1))
56728 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56729 LHS.getOperand(1), CarryIn);
56730
56731 return SDValue();
56732}
56733
56735 const SDLoc &DL, EVT VT,
56736 const X86Subtarget &Subtarget) {
56737 using namespace SDPatternMatch;
56738
56739 // Example of pattern we try to detect:
56740 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
56741 //(add (build_vector (extract_elt t, 0),
56742 // (extract_elt t, 2),
56743 // (extract_elt t, 4),
56744 // (extract_elt t, 6)),
56745 // (build_vector (extract_elt t, 1),
56746 // (extract_elt t, 3),
56747 // (extract_elt t, 5),
56748 // (extract_elt t, 7)))
56749
56750 if (!Subtarget.hasSSE2())
56751 return SDValue();
56752
56753 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
56754 VT.getVectorNumElements() < 4 ||
56755      !isPowerOf2_32(VT.getVectorNumElements()))
56756    return SDValue();
56757
56758 SDValue Op0, Op1, Accum;
56759 if (!sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
56760 m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))) &&
56761 !sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
56762 m_Add(m_Value(Accum), m_AllOf(m_Opc(ISD::BUILD_VECTOR),
56763 m_Value(Op1))))))
56764 return SDValue();
56765
56766 // Check if one of Op0,Op1 is of the form:
56767 // (build_vector (extract_elt Mul, 0),
56768 // (extract_elt Mul, 2),
56769 // (extract_elt Mul, 4),
56770 // ...
56771 // the other is of the form:
56772 // (build_vector (extract_elt Mul, 1),
56773 // (extract_elt Mul, 3),
56774 // (extract_elt Mul, 5),
56775 // ...
56776 // and identify Mul.
56777 SDValue Mul;
56778 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
56779 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
56780 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
56781 // TODO: Be more tolerant to undefs.
56782 APInt Idx0L, Idx0H, Idx1L, Idx1H;
56783 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
56784 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
56785 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
56786 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
56787 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
56788 return SDValue();
56789 // Commutativity of mul allows factors of a product to reorder.
56790 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
56791 std::swap(Idx0L, Idx1L);
56792 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
56793 std::swap(Idx0H, Idx1H);
56794 // Commutativity of add allows pairs of factors to reorder.
56795 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
56796 std::swap(Idx0L, Idx0H);
56797 std::swap(Idx1L, Idx1H);
56798 }
56799 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
56800 Idx1H != 2 * i + 3)
56801 return SDValue();
56802 if (!Mul) {
56803      // First time an extract_elt's source vector is visited. Must be a MUL
56804      // with 2x the number of vector elements of the BUILD_VECTOR.
56805      // Both extracts must be from the same MUL.
56806 Mul = Vec0L;
56807 if (Mul.getOpcode() != ISD::MUL ||
56808 Mul.getValueType().getVectorNumElements() != 2 * e)
56809 return SDValue();
56810 }
56811 // Check that the extract is from the same MUL previously seen.
56812 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
56813 return SDValue();
56814 }
56815
56816 // Check if the Mul source can be safely shrunk.
56817 ShrinkMode Mode;
56818 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
56819 Mode == ShrinkMode::MULU16)
56820 return SDValue();
56821
56822 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56823 VT.getVectorNumElements() * 2);
56824 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
56825 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
56826
56827 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
56828 ArrayRef<SDValue> Ops) {
56829 EVT InVT = Ops[0].getValueType();
56830 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
56831 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
56832 InVT.getVectorNumElements() / 2);
56833 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
56834 };
56835 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
56836 if (Accum)
56837 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
56838 return R;
56839}
56840
56841// Attempt to turn this pattern into PMADDWD.
56842// (add (mul (sext (build_vector)), (sext (build_vector))),
56843//      (mul (sext (build_vector)), (sext (build_vector))))
56845 const SDLoc &DL, EVT VT,
56846 const X86Subtarget &Subtarget) {
56847 using namespace SDPatternMatch;
56848
56849 if (!Subtarget.hasSSE2())
56850 return SDValue();
56851
56852 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
56853 VT.getVectorNumElements() < 4 ||
56854      !isPowerOf2_32(VT.getVectorNumElements()))
56855    return SDValue();
56856
56857 // All inputs need to be sign extends.
56858 // TODO: Support ZERO_EXTEND from known positive?
56859 SDValue N00, N01, N10, N11;
56860 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
56861 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
56862 return SDValue();
56863
56864 // Must be extending from vXi16.
56865 EVT InVT = N00.getValueType();
56866 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
56867 N10.getValueType() != InVT || N11.getValueType() != InVT)
56868 return SDValue();
56869
56870 // All inputs should be build_vectors.
56871 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
56872 N01.getOpcode() != ISD::BUILD_VECTOR ||
56873 N10.getOpcode() != ISD::BUILD_VECTOR ||
56874      N11.getOpcode() != ISD::BUILD_VECTOR)
56875    return SDValue();
56876
56877 // For each element, we need to ensure we have an odd element from one vector
56878 // multiplied by the odd element of another vector and the even element from
56879 // one of the same vectors being multiplied by the even element from the
56880  // other vector. So we need to make sure that, for each element i, this
56881  // operation is performed:
56882 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
56883 SDValue In0, In1;
56884 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
56885 SDValue N00Elt = N00.getOperand(i);
56886 SDValue N01Elt = N01.getOperand(i);
56887 SDValue N10Elt = N10.getOperand(i);
56888 SDValue N11Elt = N11.getOperand(i);
56889 // TODO: Be more tolerant to undefs.
56890 SDValue N00In, N01In, N10In, N11In;
56891 APInt IdxN00, IdxN01, IdxN10, IdxN11;
56892 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
56893 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
56894 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
56895 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
56896 return SDValue();
56897 // Add is commutative so indices can be reordered.
56898 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
56899 std::swap(IdxN00, IdxN10);
56900 std::swap(IdxN01, IdxN11);
56901 }
56902    // N0 indices must be the even element. N1 indices must be the next odd element.
56903 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
56904 IdxN11 != 2 * i + 1)
56905 return SDValue();
56906
56907 // First time we find an input capture it.
56908 if (!In0) {
56909 In0 = N00In;
56910 In1 = N01In;
56911
56912 // The input vectors must be at least as wide as the output.
56913      // If they are larger than the output, we extract a subvector below.
56914 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
56915 In1.getValueSizeInBits() < VT.getSizeInBits())
56916 return SDValue();
56917 }
56918 // Mul is commutative so the input vectors can be in any order.
56919 // Canonicalize to make the compares easier.
56920 if (In0 != N00In)
56921 std::swap(N00In, N01In);
56922 if (In0 != N10In)
56923 std::swap(N10In, N11In);
56924 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
56925 return SDValue();
56926 }
56927
56928 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
56929 ArrayRef<SDValue> Ops) {
56930 EVT OpVT = Ops[0].getValueType();
56931 assert(OpVT.getScalarType() == MVT::i16 &&
56932 "Unexpected scalar element type");
56933 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
56934 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
56935 OpVT.getVectorNumElements() / 2);
56936 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
56937 };
56938
56939 // If the output is narrower than an input, extract the low part of the input
56940 // vector.
56941 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56942 VT.getVectorNumElements() * 2);
56943 if (OutVT16.bitsLT(In0.getValueType())) {
56944 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
56945 DAG.getVectorIdxConstant(0, DL));
56946 }
56947 if (OutVT16.bitsLT(In1.getValueType())) {
56948 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
56949 DAG.getVectorIdxConstant(0, DL));
56950 }
56951 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
56952 PMADDBuilder);
56953}
56954
56955// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
56956// If upper element in each pair of both VPMADDWD are zero then we can merge
56957// the operand elements and use the implicit add of VPMADDWD.
56958// TODO: Add support for VPMADDUBSW (which isn't commutable).
56960 const SDLoc &DL, EVT VT) {
56961 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
56962 return SDValue();
56963
56964 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
56965 if (VT.getSizeInBits() > 128)
56966 return SDValue();
56967
56968 unsigned NumElts = VT.getVectorNumElements();
56969 MVT OpVT = N0.getOperand(0).getSimpleValueType();
56971 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
56972
56973 bool Op0HiZero =
56974 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
56975 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
56976 bool Op1HiZero =
56977 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
56978 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
56979
56980 // TODO: Check for zero lower elements once we have actual codegen that
56981 // creates them.
56982 if (!Op0HiZero || !Op1HiZero)
56983 return SDValue();
56984
56985 // Create a shuffle mask packing the lower elements from each VPMADDWD.
56986 SmallVector<int> Mask;
56987 for (int i = 0; i != (int)NumElts; ++i) {
56988 Mask.push_back(2 * i);
56989 Mask.push_back(2 * (i + NumElts));
56990 }
56991
56992 SDValue LHS =
56993 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
56994 SDValue RHS =
56995 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
56996 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
56997}
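// Illustrative sketch of the merge above (hypothetical lanes): if
// x = vpmaddwd(<a0,0,a1,0,a2,0,a3,0>, <b0,0,b1,0,b2,0,b3,0>) then each lane
// of x is simply ai*bi, and likewise each lane of y = vpmaddwd(c.., d..) is
// ci*di. Interleaving the operands with the mask {0,8,2,10,4,12,6,14} lets a
// single vpmaddwd compute ai*bi + ci*di per lane, i.e. add(x, y).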
56998
56999/// CMOV of constants requires materializing constant operands in registers.
57000/// Try to fold those constants into an 'add' instruction to reduce instruction
57001/// count. We do this with CMOV rather than the generic 'select' because there are
57002/// earlier folds that may be used to turn select-of-constants into logic hacks.
57003static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
57004 SelectionDAG &DAG,
57005 const X86Subtarget &Subtarget) {
57006 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57007 // better because we eliminate 1-2 instructions. This transform is still
57008 // an improvement without zero operands because we trade 2 move constants and
57009 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57010 // immediate asm operands (fit in 32-bits).
57011 auto isSuitableCmov = [](SDValue V) {
57012 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57013 return false;
57014 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57015 !isa<ConstantSDNode>(V.getOperand(1)))
57016 return false;
57017 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57018 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57019 V.getConstantOperandAPInt(1).isSignedIntN(32));
57020 };
57021
57022 // Match an appropriate CMOV as the first operand of the add.
57023 SDValue Cmov = N->getOperand(0);
57024 SDValue OtherOp = N->getOperand(1);
57025 if (!isSuitableCmov(Cmov))
57026 std::swap(Cmov, OtherOp);
57027 if (!isSuitableCmov(Cmov))
57028 return SDValue();
57029
57030 // Don't remove a load folding opportunity for the add. That would neutralize
57031 // any improvements from removing constant materializations.
57032 if (X86::mayFoldLoad(OtherOp, Subtarget))
57033 return SDValue();
57034
57035 EVT VT = N->getValueType(0);
57036 SDValue FalseOp = Cmov.getOperand(0);
57037 SDValue TrueOp = Cmov.getOperand(1);
57038
57039 // We will push the add through the select, but we can potentially do better
57040 // if we know there is another add in the sequence and this is pointer math.
57041 // In that case, we can absorb an add into the trailing memory op and avoid
57042 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57043 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
57044 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
57045 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
57046 all_of(N->users(), [&](SDNode *Use) {
57047 auto *MemNode = dyn_cast<MemSDNode>(Use);
57048 return MemNode && MemNode->getBasePtr().getNode() == N;
57049 })) {
57050 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
57051 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
57052 // it is possible that choosing op1 might be better.
57053 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
57054 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
57055 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
57056 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
57057 Cmov.getOperand(2), Cmov.getOperand(3));
57058 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
57059 }
57060
57061 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
57062 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
57063 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
57064 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
57065 Cmov.getOperand(3));
57066}
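// Worked example (hypothetical constants): add (cmov 0, 40), x becomes
// cmov (add x, 0), (add x, 40) == cmov x, (add x, 40), so the constant
// materialization disappears and the remaining add can usually be an LEA.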
57067
57068static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
57069 TargetLowering::DAGCombinerInfo &DCI,
57070 const X86Subtarget &Subtarget) {
57071 EVT VT = N->getValueType(0);
57072 SDValue Op0 = N->getOperand(0);
57073 SDValue Op1 = N->getOperand(1);
57074 SDLoc DL(N);
57075
57076 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
57077 return Select;
57078
57079 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
57080 return MAdd;
57081 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
57082 return MAdd;
57083 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
57084 return MAdd;
57085
57086 // Try to synthesize horizontal adds from adds of shuffles.
57087 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
57088 return V;
57089
57090 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
57091 // iff X and Y won't overflow.
57092 if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW &&
57093 ISD::isBuildVectorAllZeros(Op0.getOperand(1).getNode()) &&
57094 ISD::isBuildVectorAllZeros(Op1.getOperand(1).getNode())) {
57095 if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) {
57096 MVT OpVT = Op0.getOperand(1).getSimpleValueType();
57097 SDValue Sum =
57098 DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0));
57099 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
57100 getZeroVector(OpVT, Subtarget, DAG, DL));
57101 }
57102 }
57103
57104 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
57105 // (sub Y, (sext (vXi1 X))).
57106 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
57107 // generic DAG combine without a legal type check, but adding this there
57108 // caused regressions.
57109 if (VT.isVector()) {
57110 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57111 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
57112 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
57113 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
57114 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
57115 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
57116 }
57117
57118 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
57119 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
57120 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
57121 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
57122 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
57123 }
57124 }
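  // The identity behind the fold above: for a boolean lane b, zext(b) is 0 or
  // 1 while sext(b) is 0 or -1, so Y + zext(b) == Y - sext(b). Illustrative
  // lane values: b = 1, Y = 7 gives 7 + 1 == 7 - (-1) == 8.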
57125
57126 // Peephole for 512-bit VPDPWSSD on non-VLX targets.
57127 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
57128 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
57129 using namespace SDPatternMatch;
57130 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
57131 if (sd_match(N, m_Add(m_Value(Accum),
57132 m_Node(ISD::CONCAT_VECTORS,
57134 m_Value(Lo1)),
57136 m_Value(Hi1)))))) {
57137 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
57138 concatSubVectors(Lo0, Hi0, DAG, DL),
57139 concatSubVectors(Lo1, Hi1, DAG, DL));
57140 }
57141 }
57142
57143 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
57144 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
57145 X86::isZeroNode(Op0.getOperand(1))) {
57146 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
57147 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
57148 Op0.getOperand(0), Op0.getOperand(2));
57149 }
57150
57151 return combineAddOrSubToADCOrSBB(N, DL, DAG);
57152}
57153
57154// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
57155// condition comes from the subtract node that produced -X. This matches the
57156// cmov expansion for absolute value. By swapping the operands we convert abs
57157// to nabs.
57158static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
57159 SelectionDAG &DAG) {
57160 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
57161 return SDValue();
57162
57163 SDValue Cond = N1.getOperand(3);
57164 if (Cond.getOpcode() != X86ISD::SUB)
57165 return SDValue();
57166 assert(Cond.getResNo() == 1 && "Unexpected result number");
57167
57168 SDValue FalseOp = N1.getOperand(0);
57169 SDValue TrueOp = N1.getOperand(1);
57170 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
57171
57172 // ABS condition should come from a negate operation.
57173 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
57174 isNullConstant(Cond.getOperand(0))) {
57175 // Get the X and -X from the negate.
57176 SDValue NegX = Cond.getValue(0);
57177 SDValue X = Cond.getOperand(1);
57178
57179 // Cmov operands should be X and NegX. Order doesn't matter.
57180 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
57181 return SDValue();
57182
57183 // Build a new CMOV with the operands swapped.
57184 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
57185 N1.getOperand(2), Cond);
57186 // Convert sub to add.
57187 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
57188 }
57189
57190 // Handle ABD special case:
57191 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
57192 // ABD condition should come from a pair of matching subtracts.
57193 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
57194 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
57195 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
57196 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
57197 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
57198 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
57199 // Build a new CMOV with the operands swapped.
57200 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
57201 Cond);
57202 }
57203
57204 return SDValue();
57205}
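// Illustrative reasoning for the swap above (hypothetical values): keeping
// the CMOV condition but exchanging its operands turns a selection of abs(X)
// into a selection of -abs(X), so Y - abs(X) can instead be computed as
// Y + (-abs(X)); e.g. 10 - |-3| == 10 + (-3) == 7.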
57206
57207static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
57208 SDValue Op0 = N->getOperand(0);
57209 SDValue Op1 = N->getOperand(1);
57210
57211 // (sub C (zero_extend (setcc)))
57212 // =>
57213 // (add (zero_extend (setcc inverted)), C-1) if C is a nonzero immediate
57214 // Don't disturb (sub 0 setcc), which is easily done with neg.
57215 EVT VT = N->getValueType(0);
57216 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
57217 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
57218 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
57219 Op1.getOperand(0).hasOneUse()) {
57220 SDValue SetCC = Op1.getOperand(0);
57221 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
57222 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
57223 APInt NewImm = Op0C->getAPIntValue() - 1;
57224 SDLoc DL(Op1);
57225 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
57226 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
57227 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
57228 DAG.getConstant(NewImm, DL, VT));
57229 }
57230
57231 return SDValue();
57232}
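// Worked example (hypothetical C = 5): if the setcc is 1 then
// C - zext(setcc) == 5 - 1 == 4 and zext(!setcc) + (C-1) == 0 + 4 == 4; if
// the setcc is 0 both forms give 5. Inverting the condition and adding C-1
// is therefore equivalent, and the immediate folds into the add.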
57233
57234static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
57235 // res, flags2 = sub 0, (setcc cc, flag)
57236 // cload/cstore ..., cond_ne, flag2
57237 // ->
57238 // cload/cstore cc, flag
57239 if (N->getConstantOperandVal(3) != X86::COND_NE)
57240 return SDValue();
57241
57242 SDValue Sub = N->getOperand(4);
57243 if (Sub.getOpcode() != X86ISD::SUB)
57244 return SDValue();
57245
57246 SDValue SetCC = Sub.getOperand(1);
57247
57248 if (!X86::isZeroNode(Sub.getOperand(0)) || SetCC.getOpcode() != X86ISD::SETCC)
57249 return SDValue();
57250
57251 SmallVector<SDValue, 5> Ops(N->op_values());
57252 Ops[3] = SetCC.getOperand(0);
57253 Ops[4] = SetCC.getOperand(1);
57254
57255 return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
57256 cast<MemSDNode>(N)->getMemoryVT(),
57257 cast<MemSDNode>(N)->getMemOperand());
57258}
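// Reasoning sketch for the rewrite above: sub(0, setcc) is nonzero exactly
// when the setcc is 1, so testing COND_NE on the subtraction's flags is
// equivalent to testing the original condition code on the original flags,
// letting the cload/cstore consume the original flags directly.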
57259
57260static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
57261 TargetLowering::DAGCombinerInfo &DCI,
57262 const X86Subtarget &Subtarget) {
57263 EVT VT = N->getValueType(0);
57264 SDValue Op0 = N->getOperand(0);
57265 SDValue Op1 = N->getOperand(1);
57266 SDLoc DL(N);
57267
57268 auto IsNonOpaqueConstant = [&](SDValue Op) {
57269 return DAG.isConstantIntBuildVectorOrConstantInt(Op,
57270 /*AllowOpaques*/ false);
57271 };
57272
57273 // X86 can't encode an immediate LHS of a sub. See if we can push the
57274 // negation into a preceding instruction. If the RHS of the sub is a XOR with
57275 // one use and a constant, invert the immediate, saving one register.
57276 // However, ignore cases where C1 is 0, as those will become a NEG.
57277 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
57278 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
57279 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
57280 Op1->hasOneUse()) {
57281 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
57282 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
57283 SDValue NewAdd =
57284 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
57285 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
57286 }
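  // The identity used above: since ~(X ^ C2) == X ^ ~C2 and -v == ~v + 1,
  // C1 - (X ^ C2) == C1 + ~(X ^ C2) + 1 == (X ^ ~C2) + (C1 + 1).
  // Illustrative 8-bit values: C1 = 10, C2 = 5, X = 3 gives 10 - 6 == 4 and
  // (3 ^ 0xFA) + 11 == 249 + 11 == 4 (mod 256).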
57287
57288 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
57289 return V;
57290
57291 // Try to synthesize horizontal subs from subs of shuffles.
57292 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
57293 return V;
57294
57295 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
57296 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
57297 X86::isZeroNode(Op1.getOperand(1))) {
57298 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
57299 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
57300 Op1.getOperand(0), Op1.getOperand(2));
57301 }
57302
57303 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
57304 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
57305 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
57306 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
57307 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
57308 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
57309 Op1.getOperand(1), Op1.getOperand(2));
57310 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
57311 }
57312
57313 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
57314 return V;
57315
57316 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
57317 return V;
57318
57319 return combineSubSetcc(N, DAG);
57320}
57321
57322static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
57323 const X86Subtarget &Subtarget) {
57324 unsigned Opcode = N->getOpcode();
57325 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
57326 "Unknown PCMP opcode");
57327
57328 SDValue LHS = N->getOperand(0);
57329 SDValue RHS = N->getOperand(1);
57330 MVT VT = N->getSimpleValueType(0);
57331 unsigned EltBits = VT.getScalarSizeInBits();
57332 unsigned NumElts = VT.getVectorNumElements();
57333 SDLoc DL(N);
57334
57335 if (LHS == RHS)
57336 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
57337 : DAG.getConstant(0, DL, VT);
57338
57339 // Constant Folding.
57340 // PCMPEQ(X,UNDEF) -> UNDEF
57341 // PCMPGT(X,UNDEF) -> 0
57342 // PCMPGT(UNDEF,X) -> 0
57343 APInt LHSUndefs, RHSUndefs;
57344 SmallVector<APInt> LHSBits, RHSBits;
57345 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
57346 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
57347 APInt Ones = APInt::getAllOnes(EltBits);
57348 APInt Zero = APInt::getZero(EltBits);
57349 SmallVector<APInt> Results(NumElts);
57350 for (unsigned I = 0; I != NumElts; ++I) {
57351 if (Opcode == X86ISD::PCMPEQ) {
57352 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
57353 } else {
57354 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
57355 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
57356 }
57357 }
57358 if (Opcode == X86ISD::PCMPEQ)
57359 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
57360 return getConstVector(Results, VT, DAG, DL);
57361 }
57362
57363 return SDValue();
57364}
57365
57366// Helper to determine if we can convert an integer comparison to a float
57367// comparison by casting the operands.
57368static std::optional<unsigned>
57369CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
57370 unsigned NumSignificantBitsRHS) {
57371 MVT SVT = VT.getScalarType();
57372 assert(SVT == MVT::f32 && "Only tested for float so far");
57373 const fltSemantics &Sem = SVT.getFltSemantics();
57374 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
57375 "Only PCMPEQ/PCMPGT currently supported");
57376
57377 // TODO: Handle bitcastable integers.
57378
57379 // For cvt + signed compare we need lhs and rhs to be exactly representable as
57380 // a fp value.
57381 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
57382 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
57383 return ISD::SINT_TO_FP;
57384
57385 return std::nullopt;
57386}
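// Illustrative example: f32 carries 24 bits of precision, so i32 operands
// known to have at most (say) 9 significant bits convert to f32 exactly, and
// an integer SETGT on them matches an ordered float compare on the converted
// values. That is the case this helper reports as convertible via SINT_TO_FP.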
57387
57388/// Helper that combines an array of subvector ops as if they were the operands
57389/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
57390/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
57391static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
57392 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
57393 TargetLowering::DAGCombinerInfo &DCI,
57394 const X86Subtarget &Subtarget) {
57395 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
57396 unsigned EltSizeInBits = VT.getScalarSizeInBits();
57397
57398 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
57399 return DAG.getUNDEF(VT);
57400
57401 if (llvm::all_of(Ops, [](SDValue Op) {
57402 return ISD::isBuildVectorAllZeros(Op.getNode());
57403 }))
57404 return getZeroVector(VT, Subtarget, DAG, DL);
57405
57406 SDValue Op0 = Ops[0];
57407 bool IsSplat = llvm::all_equal(Ops);
57408 unsigned NumOps = Ops.size();
57409 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57410 LLVMContext &Ctx = *DAG.getContext();
57411
57412 // Repeated subvectors.
57413 if (IsSplat &&
57414 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
57415 // If this broadcast is inserted into both halves, use a larger broadcast.
57416 if (Op0.getOpcode() == X86ISD::VBROADCAST)
57417 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
57418
57419 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
57420 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
57421 (Subtarget.hasAVX2() ||
57422 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
57423 VT.getScalarType(), Subtarget)))
57424 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
57425 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
57426 Op0.getOperand(0),
57427 DAG.getVectorIdxConstant(0, DL)));
57428
57429 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
57430 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
57431 (Subtarget.hasAVX2() ||
57432 (EltSizeInBits >= 32 &&
57433 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
57434 Op0.getOperand(0).getValueType() == VT.getScalarType())
57435 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
57436
57437 // concat_vectors(extract_subvector(broadcast(x)),
57438 // extract_subvector(broadcast(x))) -> broadcast(x)
57439 // concat_vectors(extract_subvector(subv_broadcast(x)),
57440 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
57441 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
57442 Op0.getOperand(0).getValueType() == VT) {
57443 SDValue SrcVec = Op0.getOperand(0);
57444 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
57445 SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
57446 return Op0.getOperand(0);
57447 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
57448 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
57449 return Op0.getOperand(0);
57450 }
57451
57452 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
57453 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
57454 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
57455 return DAG.getNode(Op0.getOpcode(), DL, VT,
57456 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
57457 Op0.getOperand(0), Op0.getOperand(0)),
57458 Op0.getOperand(1));
57459 }
57460
57461 // TODO: This should go in combineX86ShufflesRecursively eventually.
57462 if (NumOps == 2) {
57463 SDValue Src0 = peekThroughBitcasts(Ops[0]);
57464 SDValue Src1 = peekThroughBitcasts(Ops[1]);
57465 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
57467 EVT SrcVT0 = Src0.getOperand(0).getValueType();
57468 EVT SrcVT1 = Src1.getOperand(0).getValueType();
57469 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
57470 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
57471 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
57472 // Only handle concatenation of the subvector high halves, which vperm2x128 is best at.
57473 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
57474 SrcVT1.is256BitVector() &&
57475 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
57476 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
57477 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
57478 DAG.getBitcast(VT, Src0.getOperand(0)),
57479 DAG.getBitcast(VT, Src1.getOperand(0)),
57480 DAG.getTargetConstant(0x31, DL, MVT::i8));
57481 }
57482 // concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> x.
57483 if (Src0.getOperand(0) == Src1.getOperand(0) &&
57484 Src0.getConstantOperandAPInt(1) == 0 &&
57485 Src1.getConstantOperandAPInt(1) ==
57486 Src0.getValueType().getVectorNumElements()) {
57487 return DAG.getBitcast(VT, extractSubVector(Src0.getOperand(0), 0, DAG,
57488 DL, VT.getSizeInBits()));
57489 }
57490 }
57491 }
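  // Note on the 0x31 immediate used in the vperm2x128 fold above: the low
  // nibble selects 128-bit lane 1 (the high half of the first source) and the
  // high nibble selects lane 3 (the high half of the second source), so the
  // result is concat(hi(v0), hi(v1)), matching the extracted high halves.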
57492
57493 // Repeated opcode.
57494 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
57495 // but it currently struggles with different vector widths.
57496 if (llvm::all_of(Ops, [Op0](SDValue Op) {
57497 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
57498 })) {
57499 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
57500 SmallVector<SDValue> Subs;
57501 for (SDValue SubOp : SubOps)
57502 Subs.push_back(SubOp.getOperand(I));
57503 // Attempt to peek through bitcasts and concat the original subvectors.
57504 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
57505 if (SubVT.isSimple() && SubVT.isVector()) {
57506 EVT ConcatVT =
57507 EVT::getVectorVT(*DAG.getContext(), SubVT.getScalarType(),
57508 SubVT.getVectorElementCount() * Subs.size());
57509 for (SDValue &Sub : Subs)
57510 Sub = DAG.getBitcast(SubVT, Sub);
57511 return DAG.getBitcast(
57512 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
57513 }
57514 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
57515 };
57516 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
57517 bool AllConstants = true;
57518 bool AllSubs = true;
57519 unsigned VecSize = VT.getSizeInBits();
57520 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
57521 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
57522 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
57523 }))
57524 return true;
57525 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
57526 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
57527 unsigned SubSize = BC.getValueSizeInBits();
57528 unsigned EltSize = BC.getScalarValueSizeInBits();
57529 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
57530 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
57531 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
57532 BC.getOperand(0).getValueSizeInBits() == VecSize &&
57533 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
57534 }
57535 return AllConstants || AllSubs;
57536 };
57537
57538 switch (Op0.getOpcode()) {
57539 case ISD::VECTOR_SHUFFLE: {
57540 if (NumOps == 2 && VT.is256BitVector() &&
57541 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
57542 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
57543 int NumSubElts = Op0.getValueType().getVectorNumElements();
57544 SmallVector<int> NewMask;
57545 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
57546 M = M >= NumSubElts ? M + NumSubElts : M;
57547 NewMask.push_back(M);
57548 }
57549 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
57550 if (0 <= M)
57551 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
57552 NewMask.push_back(M);
57553 }
57554 return DAG.getVectorShuffle(VT, DL, ConcatSubOperand(VT, Ops, 0),
57555 ConcatSubOperand(VT, Ops, 1), NewMask);
57556 }
57557 break;
57558 }
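    // Illustrative mask widening for the shuffle concat above (hypothetical
    // NumSubElts = 4): sub-masks {0,5,2,7} and {1,4,3,6} become the v8 mask
    // {0,9,2,11,5,12,7,14}; indices into the second input of each sub-shuffle
    // are shifted up by NumSubElts, and the second sub-shuffle's indices are
    // additionally offset by NumSubElts to land in the upper concatenated half.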
57559 case X86ISD::VBROADCAST: {
57560 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
57561 return Op.getOperand(0).getValueType().is128BitVector();
57562 })) {
57563 if (VT == MVT::v4f64 || VT == MVT::v4i64)
57564 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
57565 ConcatSubOperand(VT, Ops, 0),
57566 ConcatSubOperand(VT, Ops, 0));
57567 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
57568 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
57569 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
57570 : X86ISD::PSHUFD,
57571 DL, VT, ConcatSubOperand(VT, Ops, 0),
57572 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
57573 }
57574 break;
57575 }
57576 case X86ISD::MOVDDUP:
57577 case X86ISD::MOVSHDUP:
57578 case X86ISD::MOVSLDUP: {
57579 if (!IsSplat)
57580 return DAG.getNode(Op0.getOpcode(), DL, VT,
57581 ConcatSubOperand(VT, Ops, 0));
57582 break;
57583 }
57584 case X86ISD::SHUFP: {
57585 // Add SHUFPD support if/when necessary.
57586 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
57587 llvm::all_of(Ops, [Op0](SDValue Op) {
57588 return Op.getOperand(2) == Op0.getOperand(2);
57589 })) {
57590 return DAG.getNode(Op0.getOpcode(), DL, VT,
57591 ConcatSubOperand(VT, Ops, 0),
57592 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
57593 }
57594 break;
57595 }
57596 case X86ISD::UNPCKH:
57597 case X86ISD::UNPCKL: {
57598 // Don't concatenate build_vector patterns.
57599 if (!IsSplat && EltSizeInBits >= 32 &&
57600 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57601 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
57602 none_of(Ops, [](SDValue Op) {
57603 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
57604 ISD::BUILD_VECTOR ||
57605 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
57606 ISD::BUILD_VECTOR;
57607 })) {
57608 return DAG.getNode(Op0.getOpcode(), DL, VT,
57609 ConcatSubOperand(VT, Ops, 0),
57610 ConcatSubOperand(VT, Ops, 1));
57611 }
57612 break;
57613 }
57614 case X86ISD::PSHUFHW:
57615 case X86ISD::PSHUFLW:
57616 case X86ISD::PSHUFD:
57617 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
57618 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
57619 return DAG.getNode(Op0.getOpcode(), DL, VT,
57620 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
57621 }
57622 [[fallthrough]];
57623 case X86ISD::VPERMILPI:
57624 if (!IsSplat && EltSizeInBits == 32 &&
57625 (VT.is256BitVector() ||
57626 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
57627 all_of(Ops, [&Op0](SDValue Op) {
57628 return Op0.getOperand(1) == Op.getOperand(1);
57629 })) {
57630 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
57631 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
57632 Res =
57633 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
57634 return DAG.getBitcast(VT, Res);
57635 }
57636 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
57637 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
57638 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
57639 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
57640 return DAG.getNode(Op0.getOpcode(), DL, VT,
57641 ConcatSubOperand(VT, Ops, 0),
57642 DAG.getTargetConstant(Idx, DL, MVT::i8));
57643 }
57644 break;
57645 case X86ISD::PSHUFB:
57646 case X86ISD::PSADBW:
57647 case X86ISD::VPMADDUBSW:
57648 case X86ISD::VPMADDWD:
57649 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57650 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
57651 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
57652 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
57653 NumOps * SrcVT.getVectorNumElements());
57654 return DAG.getNode(Op0.getOpcode(), DL, VT,
57655 ConcatSubOperand(SrcVT, Ops, 0),
57656 ConcatSubOperand(SrcVT, Ops, 1));
57657 }
57658 break;
57659 case X86ISD::VPERMV:
57660 if (!IsSplat && NumOps == 2 &&
57661 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
57662 MVT OpVT = Op0.getSimpleValueType();
57663 int NumSrcElts = OpVT.getVectorNumElements();
57664 SmallVector<int, 64> ConcatMask;
57665 for (unsigned i = 0; i != NumOps; ++i) {
57666 SmallVector<int, 64> SubMask;
57667 SmallVector<SDValue, 2> SubOps;
57668 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
57669 break;
57670 for (int M : SubMask) {
57671 if (0 <= M)
57672 M += i * NumSrcElts;
57673 ConcatMask.push_back(M);
57674 }
57675 }
57676 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
57677 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
57678 Ops[1].getOperand(1), DAG, DL);
57679 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
57680 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
57681 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
57682 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
57683 }
57684 }
57685 break;
57686 case X86ISD::VPERMV3:
57687 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
57688 MVT OpVT = Op0.getSimpleValueType();
57689 int NumSrcElts = OpVT.getVectorNumElements();
57690 SmallVector<int, 64> ConcatMask;
57691 for (unsigned i = 0; i != NumOps; ++i) {
57692 SmallVector<int, 64> SubMask;
57693 SmallVector<SDValue, 2> SubOps;
57694 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
57695 break;
57696 for (int M : SubMask) {
57697 if (0 <= M) {
57698 int Src = M < NumSrcElts ? 0 : 2;
57699 M += M < NumSrcElts ? 0 : NumSrcElts;
57700
57701 // Reference the lowest sub if the upper sub is the same.
57702 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
57703 M += i * NumSrcElts;
57704 }
57705 ConcatMask.push_back(M);
57706 }
57707 }
57708 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
57709 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
57710 Ops[1].getOperand(0), DAG, DL);
57711 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
57712 Ops[1].getOperand(2), DAG, DL);
57713 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
57714 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
57715 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
57716 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
57717 }
57718 }
57719 break;
57720 case X86ISD::VPERM2X128: {
57721 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
57722 assert(NumOps == 2 && "Bad concat_vectors operands");
57723 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
57724 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
57725 // TODO: Handle zero'd subvectors.
57726 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
57727 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
57728 (int)((Imm1 >> 4) & 0x3)};
57729 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
57730 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
57731 Ops[0].getOperand(1), DAG, DL);
57732 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
57733 Ops[1].getOperand(1), DAG, DL);
57734 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
57735 DAG.getBitcast(ShuffleVT, LHS),
57736 DAG.getBitcast(ShuffleVT, RHS),
57737 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
57738 return DAG.getBitcast(VT, Res);
57739 }
57740 }
57741 break;
57742 }
57743 case X86ISD::SHUF128: {
57744 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
57745 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
57746 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
57747 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
57748 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
57749 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
57750 Ops[0].getOperand(1), DAG, DL);
57751 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
57752 Ops[1].getOperand(1), DAG, DL);
57753 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
57754 DAG.getTargetConstant(Imm, DL, MVT::i8));
57755 }
57756 break;
57757 }
57758 case ISD::TRUNCATE:
57759 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
57760 EVT SrcVT = Ops[0].getOperand(0).getValueType();
57761 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
57762 SrcVT == Ops[1].getOperand(0).getValueType() &&
57763 Subtarget.useAVX512Regs() &&
57764 Subtarget.getPreferVectorWidth() >= 512 &&
57765 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
57766 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
57767 return DAG.getNode(ISD::TRUNCATE, DL, VT,
57768 ConcatSubOperand(NewSrcVT, Ops, 0));
57769 }
57770 }
57771 break;
57772 case ISD::ANY_EXTEND:
57773 case ISD::SIGN_EXTEND:
57774 case ISD::ZERO_EXTEND:
57775 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
57776 if (!IsSplat && NumOps == 2 &&
57777 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57778 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57779 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
57780 EVT SrcVT = Ops[0].getOperand(0).getValueType();
57781 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
57782 SrcVT == Ops[1].getOperand(0).getValueType()) {
57783 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
57784 return DAG.getNode(Op0.getOpcode(), DL, VT,
57785 ConcatSubOperand(NewSrcVT, Ops, 0));
57786 }
57787 }
57788 break;
57789 case X86ISD::VSHLI:
57790 case X86ISD::VSRLI:
57791 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
57792 // TODO: Move this to LowerShiftByScalarImmediate?
57793 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
57794 llvm::all_of(Ops, [](SDValue Op) {
57795 return Op.getConstantOperandAPInt(1) == 32;
57796 })) {
57797 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
57798 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
57799 if (Op0.getOpcode() == X86ISD::VSHLI) {
57800 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
57801 {8, 0, 8, 2, 8, 4, 8, 6});
57802 } else {
57803 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
57804 {1, 8, 3, 8, 5, 8, 7, 8});
57805 }
57806 return DAG.getBitcast(VT, Res);
57807 }
57808 [[fallthrough]];
57809 case X86ISD::VSRAI:
57810 case X86ISD::VSHL:
57811 case X86ISD::VSRL:
57812 case X86ISD::VSRA:
57813 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
57814 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57815 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
57816 llvm::all_of(Ops, [Op0](SDValue Op) {
57817 return Op0.getOperand(1) == Op.getOperand(1);
57818 })) {
57819 return DAG.getNode(Op0.getOpcode(), DL, VT,
57820 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
57821 }
57822 break;
57823 case X86ISD::VPERMI:
57824 case X86ISD::VROTLI:
57825 case X86ISD::VROTRI:
57826 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57827 llvm::all_of(Ops, [Op0](SDValue Op) {
57828 return Op0.getOperand(1) == Op.getOperand(1);
57829 })) {
57830 return DAG.getNode(Op0.getOpcode(), DL, VT,
57831 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
57832 }
57833 break;
57834 case ISD::AND:
57835 case ISD::OR:
57836 case ISD::XOR:
57837 case X86ISD::ANDNP:
57838 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57839 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
57840 return DAG.getNode(Op0.getOpcode(), DL, VT,
57841 ConcatSubOperand(VT, Ops, 0),
57842 ConcatSubOperand(VT, Ops, 1));
57843 }
57844 break;
57845 case X86ISD::PCMPEQ:
57846 case X86ISD::PCMPGT:
57847 if (!IsSplat && VT.is256BitVector() &&
57848 (Subtarget.hasInt256() || VT == MVT::v8i32) &&
57849 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
57850 if (Subtarget.hasInt256())
57851 return DAG.getNode(Op0.getOpcode(), DL, VT,
57852 ConcatSubOperand(VT, Ops, 0),
57853 ConcatSubOperand(VT, Ops, 1));
57854
57855 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
57856 // TODO: Handle v4f64 as well?
57857 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
57858 for (unsigned I = 0; I != NumOps; ++I) {
57859 MaxSigBitsLHS =
57860 std::max(MaxSigBitsLHS,
57861 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
57862 MaxSigBitsRHS =
57863 std::max(MaxSigBitsRHS,
57864 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
57865 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
57866 break;
57867 }
57868
57869 ISD::CondCode ICC =
57870 Op0.getOpcode() == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
57871 ISD::CondCode FCC =
57872 Op0.getOpcode() == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
57873
57874 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
57875 MVT FpVT = VT.changeVectorElementType(FpSVT);
57876
57877 if (std::optional<unsigned> CastOpc =
57878 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
57879 SDValue LHS = ConcatSubOperand(VT, Ops, 0);
57880 SDValue RHS = ConcatSubOperand(VT, Ops, 1);
57881 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
57882 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
57883
57884 bool IsAlwaysSignaling;
57885 unsigned FSETCC =
57886 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
57887 return DAG.getBitcast(
57888 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
57889 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
57890 }
57891 }
57892 break;
57893 case ISD::CTPOP:
57894 case ISD::CTTZ:
57895 case ISD::CTLZ:
57896 case ISD::CTTZ_ZERO_UNDEF:
57897 case ISD::CTLZ_ZERO_UNDEF:
57898 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57899 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
57900 return DAG.getNode(Op0.getOpcode(), DL, VT,
57901 ConcatSubOperand(VT, Ops, 0));
57902 }
57903 break;
57904 case X86ISD::GF2P8AFFINEQB:
57905 if (!IsSplat &&
57906 (VT.is256BitVector() ||
57907 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
57908 llvm::all_of(Ops, [Op0](SDValue Op) {
57909 return Op0.getOperand(2) == Op.getOperand(2);
57910 })) {
57911 return DAG.getNode(Op0.getOpcode(), DL, VT,
57912 ConcatSubOperand(VT, Ops, 0),
57913 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
57914 }
57915 break;
57916 case ISD::ADD:
57917 case ISD::SUB:
57918 case ISD::MUL:
57919 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57920 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57921 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
57922 return DAG.getNode(Op0.getOpcode(), DL, VT,
57923 ConcatSubOperand(VT, Ops, 0),
57924 ConcatSubOperand(VT, Ops, 1));
57925 }
57926 break;
57927 // VADD, VSUB and VMUL can execute on more ports than VINSERT and their
57928 // latencies are short, so we don't replace them here unless we won't
57929 // introduce an extra VINSERT.
57930 case ISD::FADD:
57931 case ISD::FSUB:
57932 case ISD::FMUL:
57933 if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
57934 (VT.is256BitVector() ||
57935 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
57936 return DAG.getNode(Op0.getOpcode(), DL, VT,
57937 ConcatSubOperand(VT, Ops, 0),
57938 ConcatSubOperand(VT, Ops, 1));
57939 }
57940 break;
57941 case ISD::FDIV:
57942 if (!IsSplat && (VT.is256BitVector() ||
57943 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
57944 return DAG.getNode(Op0.getOpcode(), DL, VT,
57945 ConcatSubOperand(VT, Ops, 0),
57946 ConcatSubOperand(VT, Ops, 1));
57947 }
57948 break;
57949 case X86ISD::HADD:
57950 case X86ISD::HSUB:
57951 case X86ISD::FHADD:
57952 case X86ISD::FHSUB:
57953 if (!IsSplat && VT.is256BitVector() &&
57954 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
57955 return DAG.getNode(Op0.getOpcode(), DL, VT,
57956 ConcatSubOperand(VT, Ops, 0),
57957 ConcatSubOperand(VT, Ops, 1));
57958 }
57959 break;
57960 case X86ISD::PACKSS:
57961 case X86ISD::PACKUS:
57962 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57963 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
57964 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
57965 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
57966 NumOps * SrcVT.getVectorNumElements());
57967 return DAG.getNode(Op0.getOpcode(), DL, VT,
57968 ConcatSubOperand(SrcVT, Ops, 0),
57969 ConcatSubOperand(SrcVT, Ops, 1));
57970 }
57971 break;
57972 case X86ISD::PALIGNR:
57973 if (!IsSplat &&
57974 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57975 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
57976 llvm::all_of(Ops, [Op0](SDValue Op) {
57977 return Op0.getOperand(2) == Op.getOperand(2);
57978 })) {
57979 return DAG.getNode(Op0.getOpcode(), DL, VT,
57980 ConcatSubOperand(VT, Ops, 0),
57981 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
57982 }
57983 break;
57984 case X86ISD::BLENDI:
57985 if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
57986 uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
57987 uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
57988 // MVT::v16i16 has repeated blend mask.
57989 if (Op0.getSimpleValueType() == MVT::v16i16) {
57990 Mask0 = (Mask0 << 8) | Mask0;
57991 Mask1 = (Mask1 << 8) | Mask1;
57992 }
57993 uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
57994 MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements());
57995 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
57996 SDValue Sel =
57997 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
57998 return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
57999 ConcatSubOperand(VT, Ops, 0));
58000 }
58001 break;
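    // Worked example for the blend-mask merge above (hypothetical masks):
    // with Mask0 = 0xA4, Mask1 = 0x0F and 16 elements in the concatenated
    // result, the combined mask is (0x0F << 8) | 0xA4 == 0x0FA4, which is then
    // bitcast to a v16i1 select mask choosing the blends' second operand
    // wherever a bit is set.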
58002 case ISD::VSELECT:
58003 if (!IsSplat && Subtarget.hasAVX512() &&
58004 (VT.is256BitVector() ||
58005 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58006 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
58007 EVT SelVT = Ops[0].getOperand(0).getValueType();
58008 if (SelVT.getVectorElementType() == MVT::i1) {
58009 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
58010 NumOps * SelVT.getVectorNumElements());
58011 if (TLI.isTypeLegal(SelVT))
58012 return DAG.getNode(Op0.getOpcode(), DL, VT,
58013 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
58014 ConcatSubOperand(VT, Ops, 1),
58015 ConcatSubOperand(VT, Ops, 2));
58016 }
58017 }
58018 [[fallthrough]];
58019 case X86ISD::BLENDV:
58020 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
58021 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
58022 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
58023 EVT SelVT = Ops[0].getOperand(0).getValueType();
58024 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
58025 if (TLI.isTypeLegal(SelVT))
58026 return DAG.getNode(Op0.getOpcode(), DL, VT,
58027 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
58028 ConcatSubOperand(VT, Ops, 1),
58029 ConcatSubOperand(VT, Ops, 2));
58030 }
58031 break;
58032 }
58033 }
58034
58035 // Fold subvector loads into one.
58036 // If needed, look through bitcasts to get to the load.
58037 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
58038 unsigned Fast;
58039 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
58040 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
58041 *FirstLd->getMemOperand(), &Fast) &&
58042 Fast) {
58043 if (SDValue Ld =
58044 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
58045 return Ld;
58046 }
58047 }
58048
58049 // Attempt to fold target constant loads.
58050 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
58051 SmallVector<APInt> EltBits;
58052 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
58053 for (unsigned I = 0; I != NumOps; ++I) {
58054 APInt OpUndefElts;
58055 SmallVector<APInt> OpEltBits;
58056 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
58057 OpEltBits, /*AllowWholeUndefs*/ true,
58058 /*AllowPartialUndefs*/ false))
58059 break;
58060 EltBits.append(OpEltBits);
58061 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
58062 }
58063 if (EltBits.size() == VT.getVectorNumElements()) {
58064 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
58065 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
58066 SDValue CV = DAG.getConstantPool(C, PVT);
58067 MachineFunction &MF = DAG.getMachineFunction();
58068 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
58069 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
58070 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
58071 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
58072 return Ld;
58073 }
58074 }
58075
58076 // If this simple subvector or scalar/subvector broadcast_load is inserted
58077 // into both halves, use a larger broadcast_load. Update other uses to use
58078 // an extracted subvector.
58079 if (IsSplat &&
58080 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58081 if (ISD::isNormalLoad(Op0.getNode()) ||
58082 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
58083 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
58084 auto *Mem = cast<MemSDNode>(Op0);
58085 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
58086 ? X86ISD::VBROADCAST_LOAD
58087 : X86ISD::SUBV_BROADCAST_LOAD;
58088 if (SDValue BcastLd =
58089 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
58090 SDValue BcastSrc =
58091 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
58092 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
58093 return BcastLd;
58094 }
58095 }
58096 }
58097
58098 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
58099 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
58100 Subtarget.useAVX512Regs()) {
58101 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58102 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
58103 Res = DAG.getBitcast(ShuffleVT, Res);
58104 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
58105 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58106 return DAG.getBitcast(VT, Res);
58107 }
58108
58109 return SDValue();
58110}
58111
58112static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
58113 TargetLowering::DAGCombinerInfo &DCI,
58114 const X86Subtarget &Subtarget) {
58115 EVT VT = N->getValueType(0);
58116 EVT SrcVT = N->getOperand(0).getValueType();
58117 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58118 SmallVector<SDValue, 4> Ops(N->ops());
58119
58120 if (VT.getVectorElementType() == MVT::i1) {
58121 // Attempt to constant fold.
58122 unsigned SubSizeInBits = SrcVT.getSizeInBits();
58123 APInt Constant = APInt::getZero(VT.getSizeInBits());
58124 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
58125 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
58126 if (!C) break;
58127 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
58128 if (I == (E - 1)) {
58129 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
58130 if (TLI.isTypeLegal(IntVT))
58131 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
58132 }
58133 }
58134
58135 // Don't do anything else for i1 vectors.
58136 return SDValue();
58137 }
58138
58139 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
58140 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
58141 DCI, Subtarget))
58142 return R;
58143 }
58144
58145 return SDValue();
58146}
58147
58148static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
58149 TargetLowering::DAGCombinerInfo &DCI,
58150 const X86Subtarget &Subtarget) {
58151 if (DCI.isBeforeLegalizeOps())
58152 return SDValue();
58153
58154 MVT OpVT = N->getSimpleValueType(0);
58155
58156 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
58157
58158 SDLoc dl(N);
58159 SDValue Vec = N->getOperand(0);
58160 SDValue SubVec = N->getOperand(1);
58161
58162 uint64_t IdxVal = N->getConstantOperandVal(2);
58163 MVT SubVecVT = SubVec.getSimpleValueType();
58164
58165 if (Vec.isUndef() && SubVec.isUndef())
58166 return DAG.getUNDEF(OpVT);
58167
58168 // Inserting undefs/zeros into zeros/undefs is a zero vector.
58169 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
58170 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
58171 return getZeroVector(OpVT, Subtarget, DAG, dl);
58172
58173 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
58174 // If we're inserting into a zero vector and then into a larger zero vector,
58175 // just insert into the larger zero vector directly.
58176 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
58177 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
58178 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
58179 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
58180 getZeroVector(OpVT, Subtarget, DAG, dl),
58181 SubVec.getOperand(1),
58182 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
58183 }
58184
58185 // If we're inserting into a zero vector and our input was extracted from an
58186 // insert into a zero vector of the same type and the extraction was at
58187 // least as large as the original insertion, just insert the original
58188 // subvector into a zero vector.
58189 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
58190 isNullConstant(SubVec.getOperand(1)) &&
58191 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
58192 SDValue Ins = SubVec.getOperand(0);
58193 if (isNullConstant(Ins.getOperand(2)) &&
58194 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
58195 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
58196 SubVecVT.getFixedSizeInBits())
58197 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
58198 getZeroVector(OpVT, Subtarget, DAG, dl),
58199 Ins.getOperand(1), N->getOperand(2));
58200 }
58201 }
58202
58203 // Stop here if this is an i1 vector.
58204 if (IsI1Vector)
58205 return SDValue();
58206
58207 // Eliminate an intermediate vector widening:
58208 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
58209 // insert_subvector X, Y, Idx
58210 // TODO: This is a more general version of a DAGCombiner fold, can we move it
58211 // there?
58212 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
58213 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
58214 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
58215 SubVec.getOperand(1), N->getOperand(2));
58216
58217 // If this is an insert of an extract, combine to a shuffle. Don't do this
58218 // if the insert or extract can be represented with a subregister operation.
58219 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58220 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
58221 (IdxVal != 0 ||
58222 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
58223 int ExtIdxVal = SubVec.getConstantOperandVal(1);
58224 if (ExtIdxVal != 0) {
58225 int VecNumElts = OpVT.getVectorNumElements();
58226 int SubVecNumElts = SubVecVT.getVectorNumElements();
58227 SmallVector<int, 64> Mask(VecNumElts);
58228 // First create an identity shuffle mask.
58229 for (int i = 0; i != VecNumElts; ++i)
58230 Mask[i] = i;
58231 // Now insert the extracted portion.
58232 for (int i = 0; i != SubVecNumElts; ++i)
58233 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
58234
58235 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
58236 }
58237 }
58238
58239 // Match concat_vector style patterns.
58240 SmallVector<SDValue, 2> SubVectorOps;
58241 if (collectConcatOps(N, SubVectorOps, DAG)) {
58242 if (SDValue Fold =
58243 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
58244 return Fold;
58245
58246 // If we're inserting all zeros into the upper half, change this to
58247 // a concat with zero. We will match this to a move
58248 // with implicit upper bit zeroing during isel.
58249 // We do this here because we don't want combineConcatVectorOps to
58250 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
58251 if (SubVectorOps.size() == 2 &&
58252 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
58253 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
58254 getZeroVector(OpVT, Subtarget, DAG, dl),
58255 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
58256
58257 // Attempt to recursively combine to a shuffle.
58258 if (all_of(SubVectorOps, [](SDValue SubOp) {
58259 return isTargetShuffle(SubOp.getOpcode());
58260 })) {
58261 SDValue Op(N, 0);
58262 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
58263 return Res;
58264 }
58265 }
58266
58267 // If this is a broadcast insert into an upper undef, use a larger broadcast.
58268 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
58269 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
58270
58271 // If this is a broadcast load inserted into an upper undef, use a larger
58272 // broadcast load.
58273 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
58274 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
58275 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
58276 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
58277 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
58278 SDValue BcastLd =
58279 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
58280 MemIntr->getMemoryVT(),
58281 MemIntr->getMemOperand());
58282 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
58283 return BcastLd;
58284 }
58285
58286 // If we're splatting the lower half subvector of a full vector load into the
58287 // upper half, attempt to create a subvector broadcast.
58288 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
58289 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
58290 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
58291 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
58292 if (VecLd && SubLd &&
58293 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
58294 SubVec.getValueSizeInBits() / 8, 0))
58295 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
58296 SubLd, 0, DAG);
58297 }
58298
58299 return SDValue();
58300}
58301
58302/// If we are extracting a subvector of a vector select and the select condition
58303/// is composed of concatenated vectors, try to narrow the select width. This
58304/// is a common pattern for AVX1 integer code because 256-bit selects may be
58305/// legal, but there is almost no integer math/logic available for 256-bit.
58306/// This function should only be called with legal types (otherwise, the calls
58307/// to get simple value types will assert).
58308static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
58309 SelectionDAG &DAG) {
58310 SDValue Sel = Ext->getOperand(0);
58311 if (Sel.getOpcode() != ISD::VSELECT ||
58312 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
58313 return SDValue();
58314
58315 // Note: We assume simple value types because this should only be called with
58316 // legal operations/types.
58317 // TODO: This can be extended to handle extraction to 256-bits.
58318 MVT VT = Ext->getSimpleValueType(0);
58319 if (!VT.is128BitVector())
58320 return SDValue();
58321
58322 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
58323 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
58324 return SDValue();
58325
58326 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
58327 MVT SelVT = Sel.getSimpleValueType();
58328 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
58329 "Unexpected vector type with legal operations");
58330
58331 unsigned SelElts = SelVT.getVectorNumElements();
58332 unsigned CastedElts = WideVT.getVectorNumElements();
58333 unsigned ExtIdx = Ext->getConstantOperandVal(1);
58334 if (SelElts % CastedElts == 0) {
58335 // The select has the same or more (narrower) elements than the extract
58336 // operand. The extraction index gets scaled by that factor.
58337 ExtIdx *= (SelElts / CastedElts);
58338 } else if (CastedElts % SelElts == 0) {
58339 // The select has less (wider) elements than the extract operand. Make sure
58340 // that the extraction index can be divided evenly.
58341 unsigned IndexDivisor = CastedElts / SelElts;
58342 if (ExtIdx % IndexDivisor != 0)
58343 return SDValue();
58344 ExtIdx /= IndexDivisor;
58345 } else {
58346 llvm_unreachable("Element count of simple vector types are not divisible?");
58347 }
58348
58349 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
58350 unsigned NarrowElts = SelElts / NarrowingFactor;
58351 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
58352 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
58353 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
58354 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
58355 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
58356 return DAG.getBitcast(VT, NarrowSel);
58357}
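// Illustrative example (hypothetical types): extracting the upper v4i32 half
// (index 4) of a v8i32 vselect whose condition is a concatenation becomes a
// v4i32 select of the upper 128-bit halves of the condition and of both
// select arms, so the wide select never has to be legalized as a whole.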
58358
58359static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
58360 TargetLowering::DAGCombinerInfo &DCI,
58361 const X86Subtarget &Subtarget) {
58362 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
58363 // eventually get combined/lowered into ANDNP) with a concatenated operand,
58364 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
58365 // We let generic combining take over from there to simplify the
58366 // insert/extract and 'not'.
58367 // This pattern emerges during AVX1 legalization. We handle it before lowering
58368 // to avoid complications like splitting constant vector loads.
58369
58370 // Capture the original wide type in the likely case that we need to bitcast
58371 // back to this type.
58372 if (!N->getValueType(0).isSimple())
58373 return SDValue();
58374
58375 MVT VT = N->getSimpleValueType(0);
58376 SDValue InVec = N->getOperand(0);
58377 unsigned IdxVal = N->getConstantOperandVal(1);
58378 SDValue InVecBC = peekThroughBitcasts(InVec);
58379 EVT InVecVT = InVec.getValueType();
58380 unsigned SizeInBits = VT.getSizeInBits();
58381 unsigned InSizeInBits = InVecVT.getSizeInBits();
58382 unsigned NumSubElts = VT.getVectorNumElements();
58383 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58384 SDLoc DL(N);
58385
58386 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
58387 TLI.isTypeLegal(InVecVT) &&
58388 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
58389 auto isConcatenatedNot = [](SDValue V) {
58390 V = peekThroughBitcasts(V);
58391 if (!isBitwiseNot(V))
58392 return false;
58393 SDValue NotOp = V->getOperand(0);
58394 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
58395 };
58396 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
58397 isConcatenatedNot(InVecBC.getOperand(1))) {
58398 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
58399 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC));
58400 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
58401 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
58402 }
58403 }
58404
58405 if (DCI.isBeforeLegalizeOps())
58406 return SDValue();
58407
58408 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
58409 return V;
58410
58411 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
58412 return getZeroVector(VT, Subtarget, DAG, DL);
58413
58414 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
58415 if (VT.getScalarType() == MVT::i1)
58416 return DAG.getConstant(1, DL, VT);
58417 return getOnesVector(VT, DAG, DL);
58418 }
58419
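// Extracting a subvector of a BUILD_VECTOR just builds the narrower vector
// from the corresponding slice of operands.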
58420 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
58421 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
58422
58423 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) --> EXTRACT_SUBVECTOR(V,C1+C2)
58424 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58425 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
58426 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
58427 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
58428 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
58429 }
58430
58431 // If we are extracting from an insert into a larger vector, replace with a
58432 // smaller insert, as long as the extracted region covers at least the
58433 // inserted subvector. Don't do this for i1 vectors.
58434 // TODO: Relax the matching indices requirement?
58435 if (VT.getVectorElementType() != MVT::i1 &&
58436 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
58437 IdxVal == InVec.getConstantOperandVal(2) &&
58438 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
58439 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
58440 InVec.getOperand(0), N->getOperand(1));
58441 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
58442 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
58443 InVec.getOperand(1),
58444 DAG.getVectorIdxConstant(NewIdxVal, DL));
58445 }
58446
58447 // If we're extracting an upper subvector from a broadcast we should just
58448 // extract the lowest subvector instead, which should allow
58449 // SimplifyDemandedVectorElts to do more simplifications.
58450 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
58451 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
58452 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
58453 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
58454
58455 // If we're extracting a broadcasted subvector, just use the lowest subvector.
58456 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58457 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
58458 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
58459
58460 // Attempt to extract from the source of a shuffle vector.
58461 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
58462 SmallVector<int, 32> ShuffleMask;
58463 SmallVector<int, 32> ScaledMask;
58464 SmallVector<SDValue, 2> ShuffleInputs;
58465 unsigned NumSubVecs = InSizeInBits / SizeInBits;
58466 // Decode the shuffle mask and scale it so it's shuffling subvectors.
58467 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
58468 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
58469 unsigned SubVecIdx = IdxVal / NumSubElts;
58470 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
58471 return DAG.getUNDEF(VT);
58472 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
58473 return getZeroVector(VT, Subtarget, DAG, DL);
58474 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
58475 if (Src.getValueSizeInBits() == InSizeInBits) {
58476 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
58477 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
58478 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
58479 DL, SizeInBits);
58480 }
58481 }
58482 }
58483
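// Helper: extraction from these operands is considered free (one-use loads,
// constant build vectors and undef), so narrowing the surrounding node is
// unlikely to add cost.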
58484 auto IsExtractFree = [](SDValue V) {
58485 if (V.hasOneUse()) {
58486 V = peekThroughOneUseBitcasts(V);
58487 if (V.getOpcode() == ISD::LOAD)
58488 return true;
58489 }
58490 V = peekThroughBitcasts(V);
58491 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
58492 return true;
58493 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
58494 return true;
58495 return V.isUndef();
58496 };
58497
58498 // If we're extracting the lowest subvector and we're the only user,
58499 // we may be able to perform this with a smaller vector width.
58500 unsigned InOpcode = InVec.getOpcode();
58501 if (InVec.hasOneUse()) {
58502 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
58503 // v2f64 CVTDQ2PD(v4i32).
58504 if (InOpcode == ISD::SINT_TO_FP &&
58505 InVec.getOperand(0).getValueType() == MVT::v4i32) {
58506 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
58507 }
58508 // v2f64 CVTUDQ2PD(v4i32).
58509 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
58510 InVec.getOperand(0).getValueType() == MVT::v4i32) {
58511 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
58512 }
58513 // v2f64 CVTPS2PD(v4f32).
58514 if (InOpcode == ISD::FP_EXTEND &&
58515 InVec.getOperand(0).getValueType() == MVT::v4f32) {
58516 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
58517 }
58518 }
58519 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
58520 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
58521 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
58522 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
58523 Subtarget.hasVLX())) &&
58524 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
58525 SDValue Src = InVec.getOperand(0);
58526 if (Src.getValueType().getScalarSizeInBits() == 32)
58527 return DAG.getNode(InOpcode, DL, VT,
58528 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
58529 }
58530 if (IdxVal == 0 &&
58531 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
58532 (SizeInBits == 128 || SizeInBits == 256) &&
58533 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
58534 SDValue Ext = InVec.getOperand(0);
58535 if (Ext.getValueSizeInBits() > SizeInBits)
58536 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
58537 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
58538 return DAG.getNode(ExtOp, DL, VT, Ext);
58539 }
58540 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
58541 InVec.getOperand(0).getValueType().is256BitVector() &&
58542 InVec.getOperand(1).getValueType().is256BitVector() &&
58543 InVec.getOperand(2).getValueType().is256BitVector()) {
58544 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
58545 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
58546 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
58547 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
58548 }
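// If we're extracting the low subvector of a truncate, truncate just the
// corresponding low portion of the (wider) truncate source instead.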
58549 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
58550 (SizeInBits == 128 || SizeInBits == 256)) {
58551 SDValue InVecSrc = InVec.getOperand(0);
58552 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
58553 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
58554 return DAG.getNode(InOpcode, DL, VT, Ext);
58555 }
58556
58557 if (SizeInBits == 128 || SizeInBits == 256) {
58558 switch (InOpcode) {
58559 case X86ISD::MOVDDUP:
58560 return DAG.getNode(
58561 InOpcode, DL, VT,
58562 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
58563 case X86ISD::PSHUFD:
58564 case X86ISD::VPERMILPI:
58565 if (InVec.getOperand(0).hasOneUse()) {
58566 uint64_t M = InVec.getConstantOperandVal(1) & 255;
58567 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
58568 return DAG.getNode(InOpcode, DL, VT,
58569 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58570 DL, SizeInBits),
58571 DAG.getTargetConstant(M, DL, MVT::i8));
58572 }
58573 break;
58574 case X86ISD::PCMPEQ:
58575 case X86ISD::PCMPGT:
58576 case X86ISD::UNPCKH:
58577 case X86ISD::UNPCKL:
58578 if (IsExtractFree(InVec.getOperand(0)) ||
58579 IsExtractFree(InVec.getOperand(1)))
58580 return DAG.getNode(InOpcode, DL, VT,
58581 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58582 DL, SizeInBits),
58583 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
58584 DL, SizeInBits));
58585 break;
58586 case X86ISD::CMPP:
58587 if (IsExtractFree(InVec.getOperand(0)) ||
58588 IsExtractFree(InVec.getOperand(1)))
58589 return DAG.getNode(InOpcode, DL, VT,
58590 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58591 DL, SizeInBits),
58592 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
58593 DL, SizeInBits),
58594 InVec.getOperand(2));
58595 break;
58596 case X86ISD::BLENDI:
58597 if (IsExtractFree(InVec.getOperand(0)) ||
58598 IsExtractFree(InVec.getOperand(1))) {
58599 uint64_t M = InVec.getConstantOperandVal(2) & 255;
58600 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
58601 return DAG.getNode(InOpcode, DL, VT,
58602 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
58603 DL, SizeInBits),
58604 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
58605 DL, SizeInBits),
58606 DAG.getTargetConstant(M, DL, MVT::i8));
58607 }
58608 break;
58609 case X86ISD::VPERMV3:
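// For an upper-subvector extract, extract the relevant part of the index
// mask, widen it back to the full width, perform the full-width VPERMV3 and
// then take the low subvector of the result.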
58610 if (IdxVal != 0) {
58611 SDValue Src0 = InVec.getOperand(0);
58612 SDValue Mask = InVec.getOperand(1);
58613 SDValue Src1 = InVec.getOperand(2);
58614 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
58615 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
58616 DL, InSizeInBits);
58617 SDValue Shuffle =
58618 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
58619 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
58620 }
58621 break;
58622 }
58623 }
58624 }
58625
58626 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
58627 // as this is very likely to fold into a shuffle/truncation.
58628 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
58629 InVecVT.getScalarSizeInBits() == 64 &&
58630 InVec.getConstantOperandAPInt(1) == 32) {
58631 SDValue Ext =
58632 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
58633 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
58634 }
58635
58636 return SDValue();
58637}
58638
58639static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
58640 const X86Subtarget &Subtarget) {
58641 using namespace SDPatternMatch;
58642 EVT VT = N->getValueType(0);
58643 SDValue Src = N->getOperand(0);
58644 SDLoc DL(N);
58645
58646 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
58647 // This occurs frequently in our masked scalar intrinsic code and our
58648 // floating point select lowering with AVX512.
58649 // TODO: SimplifyDemandedBits instead?
58650 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
58651 isOneConstant(Src.getOperand(1)))
58652 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
58653
58654 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
58655 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
58656 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
58657 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
58658 isNullConstant(Src.getOperand(1)))
58659 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
58660 Src.getOperand(1));
58661
58662 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
58663 // TODO: Move to DAGCombine/SimplifyDemandedBits?
58664 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
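// Helper: match an i64 value whose upper 32 bits are either don't-care
// (any-extend) or known zero (zero-extend, extending load, or known bits),
// so the scalar_to_vector can operate on the 32-bit value instead.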
58665 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
58666 if (Op.getValueType() != MVT::i64)
58667 return SDValue();
58668 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
58669 if (Op.getOpcode() == Opc &&
58670 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
58671 return Op.getOperand(0);
58672 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
58673 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
58674 if (Ld->getExtensionType() == Ext &&
58675 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
58676 return Op;
58677 if (IsZeroExt) {
58678 KnownBits Known = DAG.computeKnownBits(Op);
58679 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
58680 return Op;
58681 }
58682 return SDValue();
58683 };
58684
58685 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
58686 return DAG.getBitcast(
58687 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
58688 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
58689
58690 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
58691 return DAG.getBitcast(
58692 VT,
58693 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
58694 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
58695 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
58696 }
58697
58698 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST) {
58699 SDValue SrcOp = Src.getOperand(0);
58700 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
58701 if (SrcOp.getValueType() == MVT::f64)
58702 return DAG.getBitcast(
58703 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
58704 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
58705 if (SrcOp.getValueType() == MVT::x86mmx)
58706 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
58707 }
58708
58709 if (VT == MVT::v4i32) {
58710 SDValue HalfSrc;
58711 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
58712 // to remove XMM->GPR->XMM moves.
58713 if (sd_match(Src, m_AnyExt(m_BitCast(
58714 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
58715 return DAG.getBitcast(
58716 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
58717 }
58718
58719 // See if we're broadcasting the scalar value, in which case just reuse that.
58720 // Ensure the same SDValue from the SDNode use is being used.
58721 if (VT.getScalarType() == Src.getValueType())
58722 for (SDNode *User : Src->users())
58723 if (User->getOpcode() == X86ISD::VBROADCAST &&
58724 Src == User->getOperand(0)) {
58725 unsigned SizeInBits = VT.getFixedSizeInBits();
58726 unsigned BroadcastSizeInBits =
58727 User->getValueSizeInBits(0).getFixedValue();
58728 if (BroadcastSizeInBits == SizeInBits)
58729 return SDValue(User, 0);
58730 if (BroadcastSizeInBits > SizeInBits)
58731 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
58732 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
58733 // coverage.
58734 }
58735
58736 // Check for cases where we've ended up with a scalarized shift, typically
58737 // during type legalization.
58738 switch (Src.getOpcode()) {
58739 case ISD::SHL:
58740 case ISD::SRL:
58741 case ISD::SRA:
58742 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
58743 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
58744 Src.hasOneUse()) {
58745 SDValue SrcVec =
58746 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
58747 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
58748 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
58749 Amt->getZExtValue(), DAG);
58750 }
58751 }
58752 break;
58753 case ISD::FSHL:
58754 case ISD::FSHR:
58755 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
58756 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
58757 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
58758 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
58759 Src.hasOneUse()) {
58760 uint64_t AmtVal =
58761 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
58762 SDValue SrcVec0 =
58763 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
58764 SDValue SrcVec1 =
58765 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
58766 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
58767 DAG.getConstant(AmtVal, DL, VT));
58768 }
58769 }
58770 break;
58771 }
58772
58773 return SDValue();
58774}
58775
58776// Simplify PMULDQ and PMULUDQ operations.
58777static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
58778 TargetLowering::DAGCombinerInfo &DCI,
58779 const X86Subtarget &Subtarget) {
58780 SDValue LHS = N->getOperand(0);
58781 SDValue RHS = N->getOperand(1);
58782
58783 // Canonicalize constant to RHS.
58784 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
58785 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
58786 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
58787
58788 // Multiply by zero.
58789 // Don't return RHS as it may contain UNDEFs.
58790 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
58791 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
58792
58793 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
58794 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58795 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
58796 return SDValue(N, 0);
58797
58798 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
58799 // convert it to any_extend_invec, due to the LegalOperations check, do the
58800 // conversion directly to a vector shuffle manually. This exposes combine
58801 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
58802 // combineX86ShufflesRecursively on SSE4.1 targets.
58803 // FIXME: This is basically a hack around several other issues related to
58804 // ANY_EXTEND_VECTOR_INREG.
58805 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
58806 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
58807 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
58808 LHS.getOperand(0).getValueType() == MVT::v4i32) {
58809 SDLoc dl(N);
58810 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
58811 LHS.getOperand(0), { 0, -1, 1, -1 });
58812 LHS = DAG.getBitcast(MVT::v2i64, LHS);
58813 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58814 }
58815 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
58816 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
58817 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
58818 RHS.getOperand(0).getValueType() == MVT::v4i32) {
58819 SDLoc dl(N);
58820 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
58821 RHS.getOperand(0), { 0, -1, 1, -1 });
58822 RHS = DAG.getBitcast(MVT::v2i64, RHS);
58823 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
58824 }
58825
58826 return SDValue();
58827}
58828
58829// Simplify VPMADDUBSW/VPMADDWD operations.
58830static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
58831 TargetLowering::DAGCombinerInfo &DCI) {
58832 MVT VT = N->getSimpleValueType(0);
58833 SDValue LHS = N->getOperand(0);
58834 SDValue RHS = N->getOperand(1);
58835 unsigned Opc = N->getOpcode();
58836 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
58837 assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) &&
58838 "Unexpected PMADD opcode");
58839
58840 // Multiply by zero.
58841 // Don't return LHS/RHS as it may contain UNDEFs.
58842 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
58843 ISD::isBuildVectorAllZeros(RHS.getNode()))
58844 return DAG.getConstant(0, SDLoc(N), VT);
58845
58846 // Constant folding.
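// Each result element combines a pair of source elements: VPMADDWD
// sign-extends both halves and adds the products, while VPMADDUBSW
// zero-extends the LHS bytes, sign-extends the RHS bytes and saturates the add.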
58847 APInt LHSUndefs, RHSUndefs;
58848 SmallVector<APInt> LHSBits, RHSBits;
58849 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
58850 unsigned DstEltBits = VT.getScalarSizeInBits();
58851 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
58852 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
58853 SmallVector<APInt> Result;
58854 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
58855 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
58856 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
58857 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
58858 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
58859 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
58860 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
58861 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
58862 Result.push_back(Res);
58863 }
58864 return getConstVector(Result, VT, DAG, SDLoc(N));
58865 }
58866
58867 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58868 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
58869 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
58870 return SDValue(N, 0);
58871
58872 return SDValue();
58873}
58874
58875static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
58876 TargetLowering::DAGCombinerInfo &DCI,
58877 const X86Subtarget &Subtarget) {
58878 EVT VT = N->getValueType(0);
58879 SDValue In = N->getOperand(0);
58880 unsigned Opcode = N->getOpcode();
58881 unsigned InOpcode = In.getOpcode();
58882 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58883 SDLoc DL(N);
58884
58885 // Try to merge vector loads and extend_inreg to an extload.
58886 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
58887 In.hasOneUse()) {
58888 auto *Ld = cast<LoadSDNode>(In);
58889 if (Ld->isSimple()) {
58890 MVT SVT = In.getSimpleValueType().getVectorElementType();
58891 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
58892 ? ISD::SEXTLOAD
58893 : ISD::ZEXTLOAD;
58894 EVT MemVT = VT.changeVectorElementType(SVT);
58895 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
58896 SDValue Load = DAG.getExtLoad(
58897 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
58898 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
58899 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
58900 return Load;
58901 }
58902 }
58903 }
58904
58905 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
58906 if (Opcode == InOpcode)
58907 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
58908
58909 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
58910 // -> EXTEND_VECTOR_INREG(X).
58911 // TODO: Handle non-zero subvector indices.
58912 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
58913 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
58914 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
58915 In.getValueSizeInBits())
58916 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
58917
58918 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
58919 // TODO: Move to DAGCombine?
58920 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
58921 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
58922 In.getValueSizeInBits() == VT.getSizeInBits()) {
58923 unsigned NumElts = VT.getVectorNumElements();
58924 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
58925 EVT EltVT = In.getOperand(0).getValueType();
58926 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
58927 for (unsigned I = 0; I != NumElts; ++I)
58928 Elts[I * Scale] = In.getOperand(I);
58929 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
58930 }
58931
58932 // Attempt to combine as a shuffle on SSE41+ targets.
58933 if (Subtarget.hasSSE41()) {
58934 SDValue Op(N, 0);
58935 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
58936 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
58937 return Res;
58938 }
58939
58940 return SDValue();
58941}
58942
58943static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
58944 TargetLowering::DAGCombinerInfo &DCI) {
58945 EVT VT = N->getValueType(0);
58946 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58947 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
58948 return DAG.getConstant(0, SDLoc(N), VT);
58949
58950 // Fold kshiftr(extract_subvector(X,C1),C2)
58951 // --> extract_subvector(kshiftr(X,C1+C2),0)
58952 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
58953 if (N->getOpcode() == X86ISD::KSHIFTR) {
58954 SDLoc DL(N);
58955 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
58956 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
58957 SDValue Src = N->getOperand(0).getOperand(0);
58958 uint64_t Amt = N->getConstantOperandVal(1) +
58959 N->getOperand(0).getConstantOperandVal(1);
58960 EVT SrcVT = Src.getValueType();
58961 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
58962 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
58963 DAG.getTargetConstant(Amt, DL, MVT::i8));
58964 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
58965 DAG.getVectorIdxConstant(0, DL));
58966 }
58967 }
58968 }
58969
58970 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
58971 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
58972 return SDValue(N, 0);
58973
58974 return SDValue();
58975}
58976
58977// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
58978// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
58979// extra instructions between the conversion due to going to scalar and back.
58980static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
58981 const X86Subtarget &Subtarget) {
58982 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
58983 return SDValue();
58984
58985 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
58986 return SDValue();
58987
58988 if (N->getValueType(0) != MVT::f32 ||
58989 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
58990 return SDValue();
58991
58992 SDLoc dl(N);
58993 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
58994 N->getOperand(0).getOperand(0));
58995 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
58996 DAG.getTargetConstant(4, dl, MVT::i32));
58997 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
58998 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
58999 DAG.getVectorIdxConstant(0, dl));
59000}
59001
59002static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
59003 TargetLowering::DAGCombinerInfo &DCI,
59004 const X86Subtarget &Subtarget) {
59005 EVT VT = N->getValueType(0);
59006 bool IsStrict = N->isStrictFPOpcode();
59007 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59008 EVT SrcVT = Src.getValueType();
59009
59010 SDLoc dl(N);
59011 if (SrcVT.getScalarType() == MVT::bf16) {
59012 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
59013 !IsStrict && Src.getOperand(0).getValueType() == VT)
59014 return Src.getOperand(0);
59015
59016 if (!SrcVT.isVector())
59017 return SDValue();
59018
59019 assert(!IsStrict && "Strict FP doesn't support BF16");
59020 if (VT.getVectorElementType() == MVT::f64) {
59021 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
59022 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
59023 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
59024 }
59025 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
59026 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
59027 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
59028 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
59029 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
59030 return DAG.getBitcast(VT, Src);
59031 }
59032
59033 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
59034 return SDValue();
59035
59036 if (Subtarget.hasFP16())
59037 return SDValue();
59038
59039 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
59040 return SDValue();
59041
59042 if (VT.getVectorElementType() != MVT::f32 &&
59043 VT.getVectorElementType() != MVT::f64)
59044 return SDValue();
59045
59046 unsigned NumElts = VT.getVectorNumElements();
59047 if (NumElts == 1 || !isPowerOf2_32(NumElts))
59048 return SDValue();
59049
59050 // Convert the input to vXi16.
59051 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
59052 Src = DAG.getBitcast(IntVT, Src);
59053
59054 // Widen to at least 8 input elements.
59055 if (NumElts < 8) {
59056 unsigned NumConcats = 8 / NumElts;
59057 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
59058 : DAG.getConstant(0, dl, IntVT);
59059 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
59060 Ops[0] = Src;
59061 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
59062 }
59063
59064 // Destination is vXf32 with at least 4 elements.
59065 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
59066 std::max(4U, NumElts));
59067 SDValue Cvt, Chain;
59068 if (IsStrict) {
59069 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
59070 {N->getOperand(0), Src});
59071 Chain = Cvt.getValue(1);
59072 } else {
59073 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
59074 }
59075
59076 if (NumElts < 4) {
59077 assert(NumElts == 2 && "Unexpected size");
59078 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
59079 DAG.getVectorIdxConstant(0, dl));
59080 }
59081
59082 if (IsStrict) {
59083 // Extend to the original VT if necessary.
59084 if (Cvt.getValueType() != VT) {
59085 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
59086 {Chain, Cvt});
59087 Chain = Cvt.getValue(1);
59088 }
59089 return DAG.getMergeValues({Cvt, Chain}, dl);
59090 }
59091
59092 // Extend to the original VT if necessary.
59093 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
59094}
59095
59096// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
59097// from. Limit this to cases where the loads have the same input chain and the
59098// output chains are unused. This avoids any memory ordering issues.
59099static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
59100 TargetLowering::DAGCombinerInfo &DCI) {
59101 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
59102 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
59103 "Unknown broadcast load type");
59104
59105 // Only do this if the chain result is unused.
59106 if (N->hasAnyUseOfValue(1))
59107 return SDValue();
59108
59109 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
59110
59111 SDValue Ptr = MemIntrin->getBasePtr();
59112 SDValue Chain = MemIntrin->getChain();
59113 EVT VT = N->getSimpleValueType(0);
59114 EVT MemVT = MemIntrin->getMemoryVT();
59115
59116 // Look at other users of our base pointer and try to find a wider broadcast.
59117 // The input chain and the size of the memory VT must match.
59118 for (SDNode *User : Ptr->users())
59119 if (User != N && User->getOpcode() == N->getOpcode() &&
59120 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
59121 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
59122 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
59123 MemVT.getSizeInBits() &&
59124 !User->hasAnyUseOfValue(1) &&
59125 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
59126 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
59127 VT.getSizeInBits());
59128 Extract = DAG.getBitcast(VT, Extract);
59129 return DCI.CombineTo(N, Extract, SDValue(User, 1));
59130 }
59131
59132 return SDValue();
59133}
59134
59135static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
59136 const X86Subtarget &Subtarget) {
59137 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
59138 return SDValue();
59139
59140 bool IsStrict = N->isStrictFPOpcode();
59141 EVT VT = N->getValueType(0);
59142 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
59143 EVT SrcVT = Src.getValueType();
59144
59145 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
59146 SrcVT.getVectorElementType() != MVT::f32)
59147 return SDValue();
59148
59149 SDLoc dl(N);
59150
59151 SDValue Cvt, Chain;
59152 unsigned NumElts = VT.getVectorNumElements();
59153 if (Subtarget.hasFP16()) {
59154 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
59155 // v4f32 (xint_to_fp v4i64))))
59156 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
59157 // v8f16 (CVTXI2P v4i64)))
59158 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
59159 Src.getNumOperands() == 2) {
59160 SDValue Cvt0, Cvt1;
59161 SDValue Op0 = Src.getOperand(0);
59162 SDValue Op1 = Src.getOperand(1);
59163 bool IsOp0Strict = Op0->isStrictFPOpcode();
59164 if (Op0.getOpcode() != Op1.getOpcode() ||
59165 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
59166 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
59167 return SDValue();
59168 }
59169 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
59170 if (IsStrict) {
59171 assert(IsOp0Strict && "Op0 must be strict node");
59172 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
59173 ? X86ISD::STRICT_CVTSI2P
59174 : X86ISD::STRICT_CVTUI2P;
59175 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
59176 {Op0.getOperand(0), Op0.getOperand(1)});
59177 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
59178 {Op1.getOperand(0), Op1.getOperand(1)});
59179 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
59180 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
59181 }
59182 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
59183 : X86ISD::CVTUI2P;
59184 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
59185 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
59186 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
59187 }
59188 return SDValue();
59189 }
59190
59191 if (NumElts == 1 || !isPowerOf2_32(NumElts))
59192 return SDValue();
59193
59194 // Widen to at least 4 input elements.
59195 if (NumElts < 4)
59196 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
59197 DAG.getConstantFP(0.0, dl, SrcVT));
59198
59199 // Destination is v8i16 with at least 8 elements.
59200 EVT CvtVT =
59201 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
59202 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
59203 if (IsStrict) {
59204 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
59205 {N->getOperand(0), Src, Rnd});
59206 Chain = Cvt.getValue(1);
59207 } else {
59208 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
59209 }
59210
59211 // Extract down to real number of elements.
59212 if (NumElts < 8) {
59213 EVT IntVT = VT.changeVectorElementTypeToInteger();
59214 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
59215 DAG.getVectorIdxConstant(0, dl));
59216 }
59217
59218 Cvt = DAG.getBitcast(VT, Cvt);
59219
59220 if (IsStrict)
59221 return DAG.getMergeValues({Cvt, Chain}, dl);
59222
59223 return Cvt;
59224}
59225
59226static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
59227 SDValue Src = N->getOperand(0);
59228
59229 // Turn MOVDQ2Q+simple_load into an mmx load.
59230 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
59231 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
59232
59233 if (LN->isSimple()) {
59234 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
59235 LN->getBasePtr(),
59236 LN->getPointerInfo(),
59237 LN->getOriginalAlign(),
59238 LN->getMemOperand()->getFlags());
59239 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
59240 return NewLd;
59241 }
59242 }
59243
59244 return SDValue();
59245}
59246
59247static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
59248 TargetLowering::DAGCombinerInfo &DCI) {
59249 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
59250 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59251 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
59252 return SDValue(N, 0);
59253
59254 return SDValue();
59255}
59256
59257// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
59258// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
59259// use x86mmx instead.
59260static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
59261 SDLoc dl(N);
59262
59263 bool MadeChange = false, CastReturnVal = false;
59264 SmallVector<SDValue, 8> Args;
59265 for (const SDValue &Arg : N->op_values()) {
59266 if (Arg.getValueType() == MVT::v1i64) {
59267 MadeChange = true;
59268 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
59269 } else
59270 Args.push_back(Arg);
59271 }
59272 SDVTList VTs = N->getVTList();
59273 SDVTList NewVTs = VTs;
59274 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
59275 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
59276 NewVTArr[0] = MVT::x86mmx;
59277 NewVTs = DAG.getVTList(NewVTArr);
59278 MadeChange = true;
59279 CastReturnVal = true;
59280 }
59281
59282 if (MadeChange) {
59283 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
59284 if (CastReturnVal) {
59285 SmallVector<SDValue, 2> Returns;
59286 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
59287 Returns.push_back(Result.getValue(i));
59288 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
59289 return DAG.getMergeValues(Returns, dl);
59290 }
59291 return Result;
59292 }
59293 return SDValue();
59294}
59295static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
59296 TargetLowering::DAGCombinerInfo &DCI) {
59297 if (!DCI.isBeforeLegalize())
59298 return SDValue();
59299
59300 unsigned IntNo = N->getConstantOperandVal(0);
59301 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
59302
59303 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59304 return FixupMMXIntrinsicTypes(N, DAG);
59305
59306 return SDValue();
59307}
59308
59309static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
59310 TargetLowering::DAGCombinerInfo &DCI) {
59311 if (!DCI.isBeforeLegalize())
59312 return SDValue();
59313
59314 unsigned IntNo = N->getConstantOperandVal(1);
59315 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
59316
59317 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59318 return FixupMMXIntrinsicTypes(N, DAG);
59319
59320 return SDValue();
59321}
59322
59323static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
59324 TargetLowering::DAGCombinerInfo &DCI) {
59325 if (!DCI.isBeforeLegalize())
59326 return SDValue();
59327
59328 unsigned IntNo = N->getConstantOperandVal(1);
59329 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
59330
59331 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
59332 return FixupMMXIntrinsicTypes(N, DAG);
59333
59334 return SDValue();
59335}
59336
59337SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
59338 DAGCombinerInfo &DCI) const {
59339 SelectionDAG &DAG = DCI.DAG;
59340 switch (N->getOpcode()) {
59341 // clang-format off
59342 default: break;
59343 case ISD::SCALAR_TO_VECTOR:
59344 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
59345 case ISD::EXTRACT_VECTOR_ELT:
59346 case X86ISD::PEXTRW:
59347 case X86ISD::PEXTRB:
59348 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
59349 case ISD::CONCAT_VECTORS:
59350 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
59351 case ISD::INSERT_SUBVECTOR:
59352 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
59353 case ISD::EXTRACT_SUBVECTOR:
59354 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
59355 case ISD::VSELECT:
59356 case ISD::SELECT:
59357 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
59358 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
59359 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
59360 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
59361 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
59362 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
59363 case X86ISD::ADD:
59364 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
59365 case X86ISD::CLOAD:
59366 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
59367 case X86ISD::SBB: return combineSBB(N, DAG);
59368 case X86ISD::ADC: return combineADC(N, DAG, DCI);
59369 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
59370 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
59371 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
59372 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
59373 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
59374 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
59375 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
59376 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
59377 case ISD::AVGCEILS:
59378 case ISD::AVGCEILU:
59379 case ISD::AVGFLOORS:
59380 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
59381 case X86ISD::BEXTR:
59382 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
59383 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
59384 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
59385 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
59386 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
59387 case X86ISD::VEXTRACT_STORE:
59388 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
59389 case ISD::SINT_TO_FP:
59390 case ISD::STRICT_SINT_TO_FP:
59391 return combineSIntToFP(N, DAG, DCI, Subtarget);
59392 case ISD::UINT_TO_FP:
59393 case ISD::STRICT_UINT_TO_FP:
59394 return combineUIntToFP(N, DAG, Subtarget);
59395 case ISD::LRINT:
59396 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
59397 case ISD::FADD:
59398 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
59399 case X86ISD::VFCMULC:
59400 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
59401 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
59402 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
59403 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
59404 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
59405 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
59406 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
59407 case X86ISD::FXOR:
59408 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
59409 case X86ISD::FMIN:
59410 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
59411 case ISD::FMINNUM:
59412 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
59413 case X86ISD::CVTSI2P:
59414 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
59415 case X86ISD::CVTP2SI:
59416 case X86ISD::CVTP2UI:
59417 case X86ISD::STRICT_CVTTP2SI:
59418 case X86ISD::CVTTP2SI:
59419 case X86ISD::STRICT_CVTTP2UI:
59420 case X86ISD::CVTTP2UI:
59421 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
59422 case X86ISD::STRICT_CVTPH2PS:
59423 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
59424 case X86ISD::BT: return combineBT(N, DAG, DCI);
59425 case ISD::ANY_EXTEND:
59426 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
59427 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
59428 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
59429 case ISD::ANY_EXTEND_VECTOR_INREG:
59430 case ISD::SIGN_EXTEND_VECTOR_INREG:
59431 case ISD::ZERO_EXTEND_VECTOR_INREG:
59432 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
59433 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
59434 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
59435 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
59436 case X86ISD::PACKSS:
59437 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
59438 case X86ISD::HADD:
59439 case X86ISD::HSUB:
59440 case X86ISD::FHADD:
59441 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
59442 case X86ISD::VSHL:
59443 case X86ISD::VSRA:
59444 case X86ISD::VSRL:
59445 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
59446 case X86ISD::VSHLI:
59447 case X86ISD::VSRAI:
59448 case X86ISD::VSRLI:
59449 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
59450 case ISD::INSERT_VECTOR_ELT:
59451 case X86ISD::PINSRB:
59452 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
59453 case X86ISD::SHUFP: // Handle all target specific shuffles
59454 case X86ISD::INSERTPS:
59455 case X86ISD::EXTRQI:
59456 case X86ISD::INSERTQI:
59457 case X86ISD::VALIGN:
59458 case X86ISD::PALIGNR:
59459 case X86ISD::VSHLDQ:
59460 case X86ISD::VSRLDQ:
59461 case X86ISD::BLENDI:
59462 case X86ISD::UNPCKH:
59463 case X86ISD::UNPCKL:
59464 case X86ISD::MOVHLPS:
59465 case X86ISD::MOVLHPS:
59466 case X86ISD::PSHUFB:
59467 case X86ISD::PSHUFD:
59468 case X86ISD::PSHUFHW:
59469 case X86ISD::PSHUFLW:
59470 case X86ISD::MOVSHDUP:
59471 case X86ISD::MOVSLDUP:
59472 case X86ISD::MOVDDUP:
59473 case X86ISD::MOVSS:
59474 case X86ISD::MOVSD:
59475 case X86ISD::MOVSH:
59476 case X86ISD::VBROADCAST:
59477 case X86ISD::VPPERM:
59478 case X86ISD::VPERMI:
59479 case X86ISD::VPERMV:
59480 case X86ISD::VPERMV3:
59481 case X86ISD::VPERMIL2:
59482 case X86ISD::VPERMILPI:
59483 case X86ISD::VPERMILPV:
59484 case X86ISD::VPERM2X128:
59485 case X86ISD::SHUF128:
59486 case X86ISD::VZEXT_MOVL:
59487 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
59488 case X86ISD::FMADD_RND:
59489 case X86ISD::FMSUB:
59490 case X86ISD::STRICT_FMSUB:
59491 case X86ISD::FMSUB_RND:
59492 case X86ISD::FNMADD:
59493 case X86ISD::STRICT_FNMADD:
59494 case X86ISD::FNMADD_RND:
59495 case X86ISD::FNMSUB:
59496 case X86ISD::STRICT_FNMSUB:
59497 case X86ISD::FNMSUB_RND:
59498 case ISD::FMA:
59499 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
59500 case X86ISD::FMADDSUB_RND:
59501 case X86ISD::FMSUBADD_RND:
59502 case X86ISD::FMADDSUB:
59503 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
59504 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
59505 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
59506 case X86ISD::MGATHER:
59507 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
59508 case ISD::MGATHER:
59509 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
59510 case X86ISD::PCMPEQ:
59511 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
59512 case X86ISD::PMULDQ:
59513 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
59514 case X86ISD::VPMADDUBSW:
59515 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
59516 case X86ISD::KSHIFTL:
59517 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
59518 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
59519 case ISD::STRICT_FP_EXTEND:
59520 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
59521 case ISD::STRICT_FP_ROUND:
59522 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
59523 case X86ISD::VBROADCAST_LOAD:
59524 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
59525 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
59526 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
59527 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
59528 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
59529 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
59530 case ISD::FP_TO_SINT_SAT:
59531 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
59532 // clang-format on
59533 }
59534
59535 return SDValue();
59536}
59537
59538bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
59539 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
59540}
59541
59542// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
59543bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
59544 EVT ExtVT) const {
59545 return Subtarget.hasAVX512() || !VT.isVector();
59546}
59547
59548bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
59549 if (!isTypeLegal(VT))
59550 return false;
59551
59552 // There are no vXi8 shifts.
59553 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
59554 return false;
59555
59556 // TODO: Almost no 8-bit ops are desirable because they have no actual
59557 // size/speed advantages vs. 32-bit ops, but they do have a major
59558 // potential disadvantage by causing partial register stalls.
59559 //
59560 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
59561 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
59562 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
59563 // check for a constant operand to the multiply.
59564 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
59565 return false;
59566
59567 // i16 instruction encodings are longer and some i16 instructions are slow,
59568 // so those are not desirable.
59569 if (VT == MVT::i16) {
59570 switch (Opc) {
59571 default:
59572 break;
59573 case ISD::LOAD:
59574 case ISD::SIGN_EXTEND:
59575 case ISD::ZERO_EXTEND:
59576 case ISD::ANY_EXTEND:
59577 case ISD::MUL:
59578 return false;
59579 case ISD::SHL:
59580 case ISD::SRA:
59581 case ISD::SRL:
59582 case ISD::SUB:
59583 case ISD::ADD:
59584 case ISD::AND:
59585 case ISD::OR:
59586 case ISD::XOR:
59587 // NDD instructions never have a "partial register write" issue because the
59588 // destination register's upper bits [63:OSIZE] are zeroed even when
59589 // OSIZE=8/16.
59590 return Subtarget.hasNDD();
59591 }
59592 }
59593
59594 // Any legal type not explicitly accounted for above here is desirable.
59595 return true;
59596}
59597
59598SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
59599 SDValue Value, SDValue Addr,
59600 int JTI,
59601 SelectionDAG &DAG) const {
59602 const Module *M = DAG.getMachineFunction().getFunction().getParent();
59603 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
59604 if (IsCFProtectionSupported) {
59605 // In case control-flow branch protection is enabled, we need to add a
59606 // notrack prefix to the indirect branch. To do that we create an NT_BRIND
59607 // SDNode.
59608 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
59609 SDValue Chain = Value;
59610 // Jump table debug info is only needed if CodeView is enabled.
59611 if (DAG.getTarget().getTargetTriple().isOSBinFormatCOFF())
59612 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
59613 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
59614 }
59615
59616 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
59617}
59618
59619TargetLowering::AndOrSETCCFoldKind
59620X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
59621 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
59622 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
59623 EVT VT = LogicOp->getValueType(0);
59624 EVT OpVT = SETCC0->getOperand(0).getValueType();
59625 if (!VT.isInteger())
59626 return AndOrSETCCFoldKind::None;
59627
59628 if (VT.isVector())
59629 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
59630 (isOperationLegal(ISD::ABS, OpVT)
59631 ? AndOrSETCCFoldKind::ABS
59632 : AndOrSETCCFoldKind::None));
59633
59634 // Don't use `NotAnd` since, even though `not` is generally shorter code size
59635 // than `add`, `add` can lower to LEA which can save moves / spills. Any case
59636 // where `NotAnd` applies, `AddAnd` does as well.
59637 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
59638 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
59639 return AndOrSETCCFoldKind::AddAnd;
59640}
59641
59642bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
59643 EVT VT = Op.getValueType();
59644 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
59645 isa<ConstantSDNode>(Op.getOperand(1));
59646
59647 // i16 is legal, but undesirable since i16 instruction encodings are longer
59648 // and some i16 instructions are slow.
59649 // 8-bit multiply-by-constant can usually be expanded to something cheaper
59650 // using LEA and/or other ALU ops.
59651 if (VT != MVT::i16 && !Is8BitMulByConstant)
59652 return false;
59653
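// Helper: check whether Op sits between a load and a store of the same
// address, i.e. a sequence that can be folded into a single RMW instruction;
// promoting such an op would lose the fold.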
59654 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
59655 if (!Op.hasOneUse())
59656 return false;
59657 SDNode *User = *Op->user_begin();
59658 if (User->getOpcode() != ISD::STORE)
59659 return false;
59660 auto *Ld = cast<LoadSDNode>(Load);
59661 auto *St = cast<StoreSDNode>(User);
59662 return Ld->getBasePtr() == St->getBasePtr();
59663 };
59664
59665 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
59666 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
59667 return false;
59668 if (!Op.hasOneUse())
59669 return false;
59670 SDNode *User = *Op->user_begin();
59671 if (User->getOpcode() != ISD::ATOMIC_STORE)
59672 return false;
59673 auto *Ld = cast<AtomicSDNode>(Load);
59674 auto *St = cast<AtomicSDNode>(User);
59675 return Ld->getBasePtr() == St->getBasePtr();
59676 };
59677
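// Helper: check whether Op's only use is a zero extension to i32/i64, which
// imulzu can fold directly when ZU is available (see the MUL case below).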
59678 auto IsFoldableZext = [](SDValue Op) {
59679 if (!Op.hasOneUse())
59680 return false;
59681 SDNode *User = *Op->user_begin();
59682 EVT VT = User->getValueType(0);
59683 return (User->getOpcode() == ISD::ZERO_EXTEND &&
59684 (VT == MVT::i32 || VT == MVT::i64));
59685 };
59686
59687 bool Commute = false;
59688 switch (Op.getOpcode()) {
59689 default: return false;
59690 case ISD::SIGN_EXTEND:
59691 case ISD::ZERO_EXTEND:
59692 case ISD::ANY_EXTEND:
59693 break;
59694 case ISD::SHL:
59695 case ISD::SRA:
59696 case ISD::SRL: {
59697 SDValue N0 = Op.getOperand(0);
59698 // Look out for (store (shl (load), x)).
59699 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
59700 return false;
59701 break;
59702 }
59703 case ISD::MUL:
59704 // When ZU is enabled, we prefer to not promote for MUL by a constant
59705 // when there is an opportunity to fold a zext with imulzu.
59706 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
59707 (isa<ConstantSDNode>(Op.getOperand(0)) ||
59708 isa<ConstantSDNode>(Op.getOperand(1))))
59709 return false;
59710 [[fallthrough]];
59711 case ISD::ADD:
59712 case ISD::AND:
59713 case ISD::OR:
59714 case ISD::XOR:
59715 Commute = true;
59716 [[fallthrough]];
59717 case ISD::SUB: {
59718 SDValue N0 = Op.getOperand(0);
59719 SDValue N1 = Op.getOperand(1);
59720 // Avoid disabling potential load folding opportunities.
59721 if (X86::mayFoldLoad(N1, Subtarget) &&
59722 (!Commute || !isa<ConstantSDNode>(N0) ||
59723 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
59724 return false;
59725 if (X86::mayFoldLoad(N0, Subtarget) &&
59726 ((Commute && !isa<ConstantSDNode>(N1)) ||
59727 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
59728 return false;
59729 if (IsFoldableAtomicRMW(N0, Op) ||
59730 (Commute && IsFoldableAtomicRMW(N1, Op)))
59731 return false;
59732 }
59733 }
59734
59735 PVT = MVT::i32;
59736 return true;
59737}
59738
59739//===----------------------------------------------------------------------===//
59740// X86 Inline Assembly Support
59741//===----------------------------------------------------------------------===//
59742
59743// Helper to match a string separated by whitespace.
59744static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
59745 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
59746
59747 for (StringRef Piece : Pieces) {
59748 if (!S.starts_with(Piece)) // Check if the piece matches.
59749 return false;
59750
59751 S = S.substr(Piece.size());
59752 StringRef::size_type Pos = S.find_first_not_of(" \t");
59753 if (Pos == 0) // We matched a prefix.
59754 return false;
59755
59756 S = S.substr(Pos);
59757 }
59758
59759 return S.empty();
59760}
59761
59762static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
59763
59764 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
59765 if (llvm::is_contained(AsmPieces, "~{cc}") &&
59766 llvm::is_contained(AsmPieces, "~{flags}") &&
59767 llvm::is_contained(AsmPieces, "~{fpsr}")) {
59768
59769 if (AsmPieces.size() == 3)
59770 return true;
59771 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
59772 return true;
59773 }
59774 }
59775 return false;
59776}
59777
59778bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
59779 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
59780
59781 const std::string &AsmStr = IA->getAsmString();
59782
59783 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
59784 if (!Ty || Ty->getBitWidth() % 16 != 0)
59785 return false;
59786
59787 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
59788 SmallVector<StringRef, 4> AsmPieces;
59789 SplitString(AsmStr, AsmPieces, ";\n");
59790
59791 switch (AsmPieces.size()) {
59792 default: return false;
59793 case 1:
59794 // FIXME: this should verify that we are targeting a 486 or better. If not,
59795 // we will turn this bswap into something that will be lowered to logical
59796 // ops instead of emitting the bswap asm. For now, we don't support 486 or
59797 // lower so don't worry about this.
59798 // bswap $0
59799 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
59800 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
59801 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
59802 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
59803 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
59804 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
59805 // No need to check constraints, nothing other than the equivalent of
59806 // "=r,0" would be valid here.
59807 return IntrinsicLowering::LowerToByteSwap(CI);
59808 }
59809
59810 // rorw $$8, ${0:w} --> llvm.bswap.i16
59811 if (CI->getType()->isIntegerTy(16) &&
59812 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
59813 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
59814 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
59815 AsmPieces.clear();
59816 StringRef ConstraintsStr = IA->getConstraintString();
59817 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
59818 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
59819 if (clobbersFlagRegisters(AsmPieces))
59820 return IntrinsicLowering::LowerToByteSwap(CI);
59821 }
59822 break;
59823 case 3:
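// rorw $$8, ${0:w} / rorl $$16, $0 / rorw $$8, ${0:w} --> llvm.bswap.i32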
59824 if (CI->getType()->isIntegerTy(32) &&
59825 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
59826 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
59827 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
59828 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
59829 AsmPieces.clear();
59830 StringRef ConstraintsStr = IA->getConstraintString();
59831 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
59832 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
59833 if (clobbersFlagRegisters(AsmPieces))
59834 return IntrinsicLowering::LowerToByteSwap(CI);
59835 }
59836
59837 if (CI->getType()->isIntegerTy(64)) {
59838 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
59839 if (Constraints.size() >= 2 &&
59840 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
59841 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
59842 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
59843 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
59844 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
59845 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
59846 return IntrinsicLowering::LowerToByteSwap(CI);
59847 }
59848 }
59849 break;
59850 }
59851 return false;
59852}
59853
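// Map an inline asm condition-code output constraint such as "{@ccz}" to the
// corresponding X86 condition code.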
59854static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
59855 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
59856 .Case("{@cca}", X86::COND_A)
59857 .Case("{@ccae}", X86::COND_AE)
59858 .Case("{@ccb}", X86::COND_B)
59859 .Case("{@ccbe}", X86::COND_BE)
59860 .Case("{@ccc}", X86::COND_B)
59861 .Case("{@cce}", X86::COND_E)
59862 .Case("{@ccz}", X86::COND_E)
59863 .Case("{@ccg}", X86::COND_G)
59864 .Case("{@ccge}", X86::COND_GE)
59865 .Case("{@ccl}", X86::COND_L)
59866 .Case("{@ccle}", X86::COND_LE)
59867 .Case("{@ccna}", X86::COND_BE)
59868 .Case("{@ccnae}", X86::COND_B)
59869 .Case("{@ccnb}", X86::COND_AE)
59870 .Case("{@ccnbe}", X86::COND_A)
59871 .Case("{@ccnc}", X86::COND_AE)
59872 .Case("{@ccne}", X86::COND_NE)
59873 .Case("{@ccnz}", X86::COND_NE)
59874 .Case("{@ccng}", X86::COND_LE)
59875 .Case("{@ccnge}", X86::COND_L)
59876 .Case("{@ccnl}", X86::COND_GE)
59877 .Case("{@ccnle}", X86::COND_G)
59878 .Case("{@ccno}", X86::COND_NO)
59879 .Case("{@ccnp}", X86::COND_NP)
59880 .Case("{@ccns}", X86::COND_NS)
59881 .Case("{@cco}", X86::COND_O)
59882 .Case("{@ccp}", X86::COND_P)
59883 .Case("{@ccs}", X86::COND_S)
59884 .Default(X86::COND_INVALID);
59885 return Cond;
59886}
59887
59888/// Given a constraint letter, return the type of constraint for this target.
59889X86TargetLowering::ConstraintType
59890X86TargetLowering::getConstraintType(StringRef Constraint) const {
59891 if (Constraint.size() == 1) {
59892 switch (Constraint[0]) {
59893 case 'R':
59894 case 'q':
59895 case 'Q':
59896 case 'f':
59897 case 't':
59898 case 'u':
59899 case 'y':
59900 case 'x':
59901 case 'v':
59902 case 'l':
59903 case 'k': // AVX512 masking registers.
59904 return C_RegisterClass;
59905 case 'a':
59906 case 'b':
59907 case 'c':
59908 case 'd':
59909 case 'S':
59910 case 'D':
59911 case 'A':
59912 return C_Register;
59913 case 'I':
59914 case 'J':
59915 case 'K':
59916 case 'N':
59917 case 'G':
59918 case 'L':
59919 case 'M':
59920 return C_Immediate;
59921 case 'C':
59922 case 'e':
59923 case 'Z':
59924 return C_Other;
59925 default:
59926 break;
59927 }
59928 }
59929 else if (Constraint.size() == 2) {
59930 switch (Constraint[0]) {
59931 default:
59932 break;
59933 case 'W':
59934 if (Constraint[1] != 's')
59935 break;
59936 return C_Other;
59937 case 'Y':
59938 switch (Constraint[1]) {
59939 default:
59940 break;
59941 case 'z':
59942 return C_Register;
59943 case 'i':
59944 case 'm':
59945 case 'k':
59946 case 't':
59947 case '2':
59948 return C_RegisterClass;
59949 }
59950 break;
59951 case 'j':
59952 switch (Constraint[1]) {
59953 default:
59954 break;
59955 case 'r':
59956 case 'R':
59957 return C_RegisterClass;
59958 }
59959 }
59960 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
59961 return C_Other;
59962 return TargetLowering::getConstraintType(Constraint);
59963}
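As a quick illustration of the classification above (the snippet is illustrative and not part of this file): 'r' is a register-class constraint, '0' is a matching constraint tied to operand 0, and 'I' is an immediate constraint restricted to 0..31, which suits shift and rotate counts.

// Hypothetical example: rotate left by a constant that satisfies 'I'.
static unsigned rotl_by_5(unsigned x) {
  unsigned out;
  asm("roll %2, %0" : "=r"(out) : "0"(x), "I"(5));
  return out;
}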
59964
59965/// Examine constraint type and operand type and determine a weight value.
59966/// This object must already have been set up with the operand type
59967/// and the current alternative constraint selected.
59968 TargetLowering::ConstraintWeight
59969 X86TargetLowering::getSingleConstraintMatchWeight(
59970 AsmOperandInfo &Info, const char *Constraint) const {
59971 ConstraintWeight Wt = CW_Invalid;
59972 Value *CallOperandVal = Info.CallOperandVal;
59973 // If we don't have a value, we can't do a match,
59974 // but allow it at the lowest weight.
59975 if (!CallOperandVal)
59976 return CW_Default;
59977 Type *Ty = CallOperandVal->getType();
59978 // Look at the constraint type.
59979 switch (*Constraint) {
59980 default:
59981 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
59982 [[fallthrough]];
59983 case 'R':
59984 case 'q':
59985 case 'Q':
59986 case 'a':
59987 case 'b':
59988 case 'c':
59989 case 'd':
59990 case 'S':
59991 case 'D':
59992 case 'A':
59993 if (CallOperandVal->getType()->isIntegerTy())
59994 Wt = CW_SpecificReg;
59995 break;
59996 case 'f':
59997 case 't':
59998 case 'u':
59999 if (Ty->isFloatingPointTy())
60000 Wt = CW_SpecificReg;
60001 break;
60002 case 'y':
60003 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
60004 Wt = CW_SpecificReg;
60005 break;
60006 case 'Y':
60007 if (StringRef(Constraint).size() != 2)
60008 break;
60009 switch (Constraint[1]) {
60010 default:
60011 return CW_Invalid;
60012 // XMM0
60013 case 'z':
60014 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
60015 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
60016 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
60017 return CW_SpecificReg;
60018 return CW_Invalid;
60019 // Conditional OpMask regs (AVX512)
60020 case 'k':
60021 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
60022 return CW_Register;
60023 return CW_Invalid;
60024 // Any MMX reg
60025 case 'm':
60026 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
60027 return CW_SpecificReg;
60028 return CW_Invalid;
60029 // Any SSE reg when ISA >= SSE2, same as 'x'
60030 case 'i':
60031 case 't':
60032 case '2':
60033 if (!Subtarget.hasSSE2())
60034 return CW_Invalid;
60035 break;
60036 }
60037 break;
60038 case 'j':
60039 if (StringRef(Constraint).size() != 2)
60040 break;
60041 switch (Constraint[1]) {
60042 default:
60043 return CW_Invalid;
60044 case 'r':
60045 case 'R':
60046 if (CallOperandVal->getType()->isIntegerTy())
60047 Wt = CW_SpecificReg;
60048 break;
60049 }
60050 break;
60051 case 'v':
60052 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
60053 Wt = CW_Register;
60054 [[fallthrough]];
60055 case 'x':
60056 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
60057 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
60058 Wt = CW_Register;
60059 break;
60060 case 'k':
60061 // Enable conditional vector operations using %k<#> registers.
60062 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
60063 Wt = CW_Register;
60064 break;
60065 case 'I':
60066 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
60067 if (C->getZExtValue() <= 31)
60068 Wt = CW_Constant;
60069 break;
60070 case 'J':
60071 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60072 if (C->getZExtValue() <= 63)
60073 Wt = CW_Constant;
60074 break;
60075 case 'K':
60076 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60077 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
60078 Wt = CW_Constant;
60079 break;
60080 case 'L':
60081 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60082 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
60083 Wt = CW_Constant;
60084 break;
60085 case 'M':
60086 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60087 if (C->getZExtValue() <= 3)
60088 Wt = CW_Constant;
60089 break;
60090 case 'N':
60091 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60092 if (C->getZExtValue() <= 0xff)
60093 Wt = CW_Constant;
60094 break;
60095 case 'G':
60096 case 'C':
60097 if (isa<ConstantFP>(CallOperandVal))
60098 Wt = CW_Constant;
60099 break;
60100 case 'e':
60101 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60102 if ((C->getSExtValue() >= -0x80000000LL) &&
60103 (C->getSExtValue() <= 0x7fffffffLL))
60104 Wt = CW_Constant;
60105 break;
60106 case 'Z':
60107 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
60108 if (C->getZExtValue() <= 0xffffffff)
60109 Wt = CW_Constant;
60110 break;
60111 }
60112 return Wt;
60113}
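For instance (illustrative only, names are made up): with a multi-alternative constraint such as "rm" on an integer operand, the weights computed above let the register alternative win whenever a GPR is available, while the memory alternative remains a legal fallback.

// Hypothetical operand with register-or-memory alternatives.
static int add_one(int x) {
  asm("addl $1, %0" : "+rm"(x));
  return x;
}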
60114
60115/// Try to replace an X constraint, which matches anything, with another that
60116/// has more specific requirements based on the type of the corresponding
60117/// operand.
60118 const char *X86TargetLowering::
60119 LowerXConstraint(EVT ConstraintVT) const {
60120 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
60121 // 'f' like normal targets.
60122 if (ConstraintVT.isFloatingPoint()) {
60123 if (Subtarget.hasSSE1())
60124 return "x";
60125 }
60126
60127 return TargetLowering::LowerXConstraint(ConstraintVT);
60128}
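A hedged sketch of the effect (not from this file): for a floating-point operand under the catch-all "X" constraint, the hook above steers the operand toward an SSE register when SSE1 is available, and otherwise leaves it to the default 'f'-style x87 handling.

// Illustrative only; the compiler may still place the operand anywhere "X" permits.
static float keep_in_reg(float f) {
  asm("" : "+X"(f));
  return f;
}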
60129
60130// Lower @cc targets via setcc.
60131 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
60132 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
60133 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
60134 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
60135 if (Cond == X86::COND_INVALID)
60136 return SDValue();
60137 // Check that return type is valid.
60138 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
60139 OpInfo.ConstraintVT.getSizeInBits() < 8)
60140 report_fatal_error("Glue output operand is of invalid type");
60141
60142 // Get EFLAGS register. Only update chain when copyfrom is glued.
60143 if (Glue.getNode()) {
60144 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
60145 Chain = Glue.getValue(1);
60146 } else
60147 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
60148 // Extract CC code.
60149 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
60150 // Extend to 32-bits
60151 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
60152
60153 return Result;
60154}
60155
60156/// Lower the specified operand into the Ops vector.
60157/// If it is invalid, don't add anything to Ops.
60158 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
60159 StringRef Constraint,
60160 std::vector<SDValue> &Ops,
60161 SelectionDAG &DAG) const {
60162 SDValue Result;
60163 char ConstraintLetter = Constraint[0];
60164 switch (ConstraintLetter) {
60165 default: break;
60166 case 'I':
60167 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60168 if (C->getZExtValue() <= 31) {
60169 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60170 Op.getValueType());
60171 break;
60172 }
60173 }
60174 return;
60175 case 'J':
60176 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60177 if (C->getZExtValue() <= 63) {
60178 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60179 Op.getValueType());
60180 break;
60181 }
60182 }
60183 return;
60184 case 'K':
60185 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60186 if (isInt<8>(C->getSExtValue())) {
60187 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60188 Op.getValueType());
60189 break;
60190 }
60191 }
60192 return;
60193 case 'L':
60194 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60195 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
60196 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
60197 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
60198 Op.getValueType());
60199 break;
60200 }
60201 }
60202 return;
60203 case 'M':
60204 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60205 if (C->getZExtValue() <= 3) {
60206 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60207 Op.getValueType());
60208 break;
60209 }
60210 }
60211 return;
60212 case 'N':
60213 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60214 if (C->getZExtValue() <= 255) {
60215 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60216 Op.getValueType());
60217 break;
60218 }
60219 }
60220 return;
60221 case 'O':
60222 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60223 if (C->getZExtValue() <= 127) {
60224 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60225 Op.getValueType());
60226 break;
60227 }
60228 }
60229 return;
60230 case 'e': {
60231 // 32-bit signed value
60232 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60233 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
60234 C->getSExtValue())) {
60235 // Widen to 64 bits here to get it sign extended.
60236 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
60237 break;
60238 }
60239 // FIXME gcc accepts some relocatable values here too, but only in certain
60240 // memory models; it's complicated.
60241 }
60242 return;
60243 }
60244 case 'W': {
60245 assert(Constraint[1] == 's');
60246 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
60247 // offset.
60248 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
60249 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
60250 BA->getValueType(0)));
60251 } else {
60252 int64_t Offset = 0;
60253 if (Op->getOpcode() == ISD::ADD &&
60254 isa<ConstantSDNode>(Op->getOperand(1))) {
60255 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
60256 Op = Op->getOperand(0);
60257 }
60258 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
60259 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
60260 GA->getValueType(0), Offset));
60261 }
60262 return;
60263 }
60264 case 'Z': {
60265 // 32-bit unsigned value
60266 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
60267 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
60268 C->getZExtValue())) {
60269 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
60270 Op.getValueType());
60271 break;
60272 }
60273 }
60274 // FIXME gcc accepts some relocatable values here too, but only in certain
60275 // memory models; it's complicated.
60276 return;
60277 }
60278 case 'i': {
60279 // Literal immediates are always ok.
60280 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
60281 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
60282 BooleanContent BCont = getBooleanContents(MVT::i64);
60283 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
60284 : ISD::SIGN_EXTEND;
60285 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
60286 : CST->getSExtValue();
60287 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
60288 break;
60289 }
60290
60291 // In any sort of PIC mode addresses need to be computed at runtime by
60292 // adding in a register or some sort of table lookup. These can't
60293 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
60294 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
60295 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
60296 return;
60297
60298 // If we are in non-pic codegen mode, we allow the address of a global (with
60299 // an optional displacement) to be used with 'i'.
60300 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
60301 // If we require an extra load to get this address, as in PIC mode, we
60302 // can't accept it.
60303 if (isGlobalStubReference(
60304 Subtarget.classifyGlobalReference(GA->getGlobal())))
60305 return;
60306 break;
60307 }
60308 }
60309
60310 if (Result.getNode()) {
60311 Ops.push_back(Result);
60312 return;
60313 }
60314 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
60315}
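For example (illustrative, not from this file), the 'N' path above admits an unsigned 8-bit constant such as an I/O port number, whereas a constant outside 0..255 would fail the range check and the constraint would be rejected.

// Hypothetical port write; 0x70 satisfies 'N' (<= 0xff).
static void cmos_select(unsigned char reg) {
  asm volatile("outb %0, %1" : : "a"(reg), "N"(0x70));
}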
60316
60317/// Check if \p RC is a general purpose register class.
60318/// I.e., GR* or one of their variant.
60319static bool isGRClass(const TargetRegisterClass &RC) {
60320 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
60321 RC.hasSuperClassEq(&X86::GR16RegClass) ||
60322 RC.hasSuperClassEq(&X86::GR32RegClass) ||
60323 RC.hasSuperClassEq(&X86::GR64RegClass) ||
60324 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
60325}
60326
60327/// Check if \p RC is a vector register class.
60328/// I.e., FR* / VR* or one of their variant.
60329static bool isFRClass(const TargetRegisterClass &RC) {
60330 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
60331 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
60332 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
60333 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
60334 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
60335 RC.hasSuperClassEq(&X86::VR512RegClass);
60336}
60337
60338/// Check if \p RC is a mask register class.
60339/// I.e., VK* or one of their variant.
60340static bool isVKClass(const TargetRegisterClass &RC) {
60341 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
60342 RC.hasSuperClassEq(&X86::VK2RegClass) ||
60343 RC.hasSuperClassEq(&X86::VK4RegClass) ||
60344 RC.hasSuperClassEq(&X86::VK8RegClass) ||
60345 RC.hasSuperClassEq(&X86::VK16RegClass) ||
60346 RC.hasSuperClassEq(&X86::VK32RegClass) ||
60347 RC.hasSuperClassEq(&X86::VK64RegClass);
60348}
60349
60350static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
60351 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
60352}
60353
60354std::pair<unsigned, const TargetRegisterClass *>
60355 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
60356 StringRef Constraint,
60357 MVT VT) const {
60358 // First, see if this is a constraint that directly corresponds to an LLVM
60359 // register class.
60360 if (Constraint.size() == 1) {
60361 // GCC Constraint Letters
60362 switch (Constraint[0]) {
60363 default: break;
60364 // 'A' means [ER]AX + [ER]DX.
60365 case 'A':
60366 if (Subtarget.is64Bit())
60367 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
60368 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
60369 "Expecting 64, 32 or 16 bit subtarget");
60370 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
60371
60372 // TODO: Slight differences here in allocation order and leaving
60373 // RIP in the class. Do they matter any more here than they do
60374 // in the normal allocation?
60375 case 'k':
60376 if (Subtarget.hasAVX512()) {
60377 if (VT == MVT::v1i1 || VT == MVT::i1)
60378 return std::make_pair(0U, &X86::VK1RegClass);
60379 if (VT == MVT::v8i1 || VT == MVT::i8)
60380 return std::make_pair(0U, &X86::VK8RegClass);
60381 if (VT == MVT::v16i1 || VT == MVT::i16)
60382 return std::make_pair(0U, &X86::VK16RegClass);
60383 }
60384 if (Subtarget.hasBWI()) {
60385 if (VT == MVT::v32i1 || VT == MVT::i32)
60386 return std::make_pair(0U, &X86::VK32RegClass);
60387 if (VT == MVT::v64i1 || VT == MVT::i64)
60388 return std::make_pair(0U, &X86::VK64RegClass);
60389 }
60390 break;
60391 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
60392 if (Subtarget.is64Bit()) {
60393 if (VT == MVT::i8 || VT == MVT::i1)
60394 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60395 ? &X86::GR8RegClass
60396 : &X86::GR8_NOREX2RegClass);
60397 if (VT == MVT::i16)
60398 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60399 ? &X86::GR16RegClass
60400 : &X86::GR16_NOREX2RegClass);
60401 if (VT == MVT::i32 || VT == MVT::f32)
60402 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60403 ? &X86::GR32RegClass
60404 : &X86::GR32_NOREX2RegClass);
60405 if (VT != MVT::f80 && !VT.isVector())
60406 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60407 ? &X86::GR64RegClass
60408 : &X86::GR64_NOREX2RegClass);
60409 break;
60410 }
60411 [[fallthrough]];
60412 // 32-bit fallthrough
60413 case 'Q': // Q_REGS
60414 if (VT == MVT::i8 || VT == MVT::i1)
60415 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
60416 if (VT == MVT::i16)
60417 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
60418 if (VT == MVT::i32 || VT == MVT::f32 ||
60419 (!VT.isVector() && !Subtarget.is64Bit()))
60420 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
60421 if (VT != MVT::f80 && !VT.isVector())
60422 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
60423 break;
60424 case 'r': // GENERAL_REGS
60425 case 'l': // INDEX_REGS
60426 if (VT == MVT::i8 || VT == MVT::i1)
60427 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60428 ? &X86::GR8RegClass
60429 : &X86::GR8_NOREX2RegClass);
60430 if (VT == MVT::i16)
60431 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60432 ? &X86::GR16RegClass
60433 : &X86::GR16_NOREX2RegClass);
60434 if (VT == MVT::i32 || VT == MVT::f32 ||
60435 (!VT.isVector() && !Subtarget.is64Bit()))
60436 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60437 ? &X86::GR32RegClass
60438 : &X86::GR32_NOREX2RegClass);
60439 if (VT != MVT::f80 && !VT.isVector())
60440 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
60441 ? &X86::GR64RegClass
60442 : &X86::GR64_NOREX2RegClass);
60443 break;
60444 case 'R': // LEGACY_REGS
60445 if (VT == MVT::i8 || VT == MVT::i1)
60446 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
60447 if (VT == MVT::i16)
60448 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
60449 if (VT == MVT::i32 || VT == MVT::f32 ||
60450 (!VT.isVector() && !Subtarget.is64Bit()))
60451 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
60452 if (VT != MVT::f80 && !VT.isVector())
60453 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
60454 break;
60455 case 'f': // FP Stack registers.
60456 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
60457 // value to the correct fpstack register class.
60458 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
60459 return std::make_pair(0U, &X86::RFP32RegClass);
60460 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
60461 return std::make_pair(0U, &X86::RFP64RegClass);
60462 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
60463 return std::make_pair(0U, &X86::RFP80RegClass);
60464 break;
60465 case 'y': // MMX_REGS if MMX allowed.
60466 if (!Subtarget.hasMMX()) break;
60467 return std::make_pair(0U, &X86::VR64RegClass);
60468 case 'v':
60469 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
60470 if (!Subtarget.hasSSE1()) break;
60471 bool VConstraint = (Constraint[0] == 'v');
60472
60473 switch (VT.SimpleTy) {
60474 default: break;
60475 // Scalar SSE types.
60476 case MVT::f16:
60477 if (VConstraint && Subtarget.hasFP16())
60478 return std::make_pair(0U, &X86::FR16XRegClass);
60479 break;
60480 case MVT::f32:
60481 case MVT::i32:
60482 if (VConstraint && Subtarget.hasVLX())
60483 return std::make_pair(0U, &X86::FR32XRegClass);
60484 return std::make_pair(0U, &X86::FR32RegClass);
60485 case MVT::f64:
60486 case MVT::i64:
60487 if (VConstraint && Subtarget.hasVLX())
60488 return std::make_pair(0U, &X86::FR64XRegClass);
60489 return std::make_pair(0U, &X86::FR64RegClass);
60490 case MVT::i128:
60491 if (Subtarget.is64Bit()) {
60492 if (VConstraint && Subtarget.hasVLX())
60493 return std::make_pair(0U, &X86::VR128XRegClass);
60494 return std::make_pair(0U, &X86::VR128RegClass);
60495 }
60496 break;
60497 // Vector types and fp128.
60498 case MVT::v8f16:
60499 if (!Subtarget.hasFP16())
60500 break;
60501 if (VConstraint)
60502 return std::make_pair(0U, &X86::VR128XRegClass);
60503 return std::make_pair(0U, &X86::VR128RegClass);
60504 case MVT::v8bf16:
60505 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60506 break;
60507 if (VConstraint)
60508 return std::make_pair(0U, &X86::VR128XRegClass);
60509 return std::make_pair(0U, &X86::VR128RegClass);
60510 case MVT::f128:
60511 case MVT::v16i8:
60512 case MVT::v8i16:
60513 case MVT::v4i32:
60514 case MVT::v2i64:
60515 case MVT::v4f32:
60516 case MVT::v2f64:
60517 if (VConstraint && Subtarget.hasVLX())
60518 return std::make_pair(0U, &X86::VR128XRegClass);
60519 return std::make_pair(0U, &X86::VR128RegClass);
60520 // AVX types.
60521 case MVT::v16f16:
60522 if (!Subtarget.hasFP16())
60523 break;
60524 if (VConstraint)
60525 return std::make_pair(0U, &X86::VR256XRegClass);
60526 return std::make_pair(0U, &X86::VR256RegClass);
60527 case MVT::v16bf16:
60528 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60529 break;
60530 if (VConstraint)
60531 return std::make_pair(0U, &X86::VR256XRegClass);
60532 return std::make_pair(0U, &X86::VR256RegClass);
60533 case MVT::v32i8:
60534 case MVT::v16i16:
60535 case MVT::v8i32:
60536 case MVT::v4i64:
60537 case MVT::v8f32:
60538 case MVT::v4f64:
60539 if (VConstraint && Subtarget.hasVLX())
60540 return std::make_pair(0U, &X86::VR256XRegClass);
60541 if (Subtarget.hasAVX())
60542 return std::make_pair(0U, &X86::VR256RegClass);
60543 break;
60544 case MVT::v32f16:
60545 if (!Subtarget.hasFP16())
60546 break;
60547 if (VConstraint)
60548 return std::make_pair(0U, &X86::VR512RegClass);
60549 return std::make_pair(0U, &X86::VR512_0_15RegClass);
60550 case MVT::v32bf16:
60551 if (!Subtarget.hasBF16())
60552 break;
60553 if (VConstraint)
60554 return std::make_pair(0U, &X86::VR512RegClass);
60555 return std::make_pair(0U, &X86::VR512_0_15RegClass);
60556 case MVT::v64i8:
60557 case MVT::v32i16:
60558 case MVT::v8f64:
60559 case MVT::v16f32:
60560 case MVT::v16i32:
60561 case MVT::v8i64:
60562 if (!Subtarget.hasAVX512()) break;
60563 if (VConstraint)
60564 return std::make_pair(0U, &X86::VR512RegClass);
60565 return std::make_pair(0U, &X86::VR512_0_15RegClass);
60566 }
60567 break;
60568 }
60569 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
60570 switch (Constraint[1]) {
60571 default:
60572 break;
60573 case 'i':
60574 case 't':
60575 case '2':
60576 return getRegForInlineAsmConstraint(TRI, "x", VT);
60577 case 'm':
60578 if (!Subtarget.hasMMX()) break;
60579 return std::make_pair(0U, &X86::VR64RegClass);
60580 case 'z':
60581 if (!Subtarget.hasSSE1()) break;
60582 switch (VT.SimpleTy) {
60583 default: break;
60584 // Scalar SSE types.
60585 case MVT::f16:
60586 if (!Subtarget.hasFP16())
60587 break;
60588 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
60589 case MVT::f32:
60590 case MVT::i32:
60591 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
60592 case MVT::f64:
60593 case MVT::i64:
60594 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
60595 case MVT::v8f16:
60596 if (!Subtarget.hasFP16())
60597 break;
60598 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
60599 case MVT::v8bf16:
60600 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60601 break;
60602 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
60603 case MVT::f128:
60604 case MVT::v16i8:
60605 case MVT::v8i16:
60606 case MVT::v4i32:
60607 case MVT::v2i64:
60608 case MVT::v4f32:
60609 case MVT::v2f64:
60610 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
60611 // AVX types.
60612 case MVT::v16f16:
60613 if (!Subtarget.hasFP16())
60614 break;
60615 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
60616 case MVT::v16bf16:
60617 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
60618 break;
60619 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
60620 case MVT::v32i8:
60621 case MVT::v16i16:
60622 case MVT::v8i32:
60623 case MVT::v4i64:
60624 case MVT::v8f32:
60625 case MVT::v4f64:
60626 if (Subtarget.hasAVX())
60627 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
60628 break;
60629 case MVT::v32f16:
60630 if (!Subtarget.hasFP16())
60631 break;
60632 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
60633 case MVT::v32bf16:
60634 if (!Subtarget.hasBF16())
60635 break;
60636 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
60637 case MVT::v64i8:
60638 case MVT::v32i16:
60639 case MVT::v8f64:
60640 case MVT::v16f32:
60641 case MVT::v16i32:
60642 case MVT::v8i64:
60643 if (Subtarget.hasAVX512())
60644 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
60645 break;
60646 }
60647 break;
60648 case 'k':
60649 // This register class doesn't allocate k0 for masked vector operation.
60650 if (Subtarget.hasAVX512()) {
60651 if (VT == MVT::v1i1 || VT == MVT::i1)
60652 return std::make_pair(0U, &X86::VK1WMRegClass);
60653 if (VT == MVT::v8i1 || VT == MVT::i8)
60654 return std::make_pair(0U, &X86::VK8WMRegClass);
60655 if (VT == MVT::v16i1 || VT == MVT::i16)
60656 return std::make_pair(0U, &X86::VK16WMRegClass);
60657 }
60658 if (Subtarget.hasBWI()) {
60659 if (VT == MVT::v32i1 || VT == MVT::i32)
60660 return std::make_pair(0U, &X86::VK32WMRegClass);
60661 if (VT == MVT::v64i1 || VT == MVT::i64)
60662 return std::make_pair(0U, &X86::VK64WMRegClass);
60663 }
60664 break;
60665 }
60666 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
60667 switch (Constraint[1]) {
60668 default:
60669 break;
60670 case 'r':
60671 if (VT == MVT::i8 || VT == MVT::i1)
60672 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
60673 if (VT == MVT::i16)
60674 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
60675 if (VT == MVT::i32 || VT == MVT::f32)
60676 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
60677 if (VT != MVT::f80 && !VT.isVector())
60678 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
60679 break;
60680 case 'R':
60681 if (VT == MVT::i8 || VT == MVT::i1)
60682 return std::make_pair(0U, &X86::GR8RegClass);
60683 if (VT == MVT::i16)
60684 return std::make_pair(0U, &X86::GR16RegClass);
60685 if (VT == MVT::i32 || VT == MVT::f32)
60686 return std::make_pair(0U, &X86::GR32RegClass);
60687 if (VT != MVT::f80 && !VT.isVector())
60688 return std::make_pair(0U, &X86::GR64RegClass);
60689 break;
60690 }
60691 }
60692
60693 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
60694 return std::make_pair(0U, &X86::GR32RegClass);
60695
60696 // Use the default implementation in TargetLowering to convert the register
60697 // constraint into a member of a register class.
60698 std::pair<Register, const TargetRegisterClass*> Res;
60699 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
60700
60701 // Not found as a standard register?
60702 if (!Res.second) {
60703 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
60704 // to/from f80.
60705 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
60706 // Map st(0) -> st(7) -> ST0
60707 if (Constraint.size() == 7 && Constraint[0] == '{' &&
60708 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
60709 Constraint[3] == '(' &&
60710 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
60711 Constraint[5] == ')' && Constraint[6] == '}') {
60712 // st(7) is not allocatable and thus not a member of RFP80. Return
60713 // singleton class in cases where we have a reference to it.
60714 if (Constraint[4] == '7')
60715 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
60716 return std::make_pair(X86::FP0 + Constraint[4] - '0',
60717 &X86::RFP80RegClass);
60718 }
60719
60720 // GCC allows "st(0)" to be called just plain "st".
60721 if (StringRef("{st}").equals_insensitive(Constraint))
60722 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
60723 }
60724
60725 // flags -> EFLAGS
60726 if (StringRef("{flags}").equals_insensitive(Constraint))
60727 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
60728
60729 // dirflag -> DF
60730 // Only allow for clobber.
60731 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
60732 VT == MVT::Other)
60733 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
60734
60735 // fpsr -> FPSW
60736 // Only allow for clobber.
60737 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
60738 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
60739
60740 return Res;
60741 }
60742
60743 // Make sure it isn't a register that requires 64-bit mode.
60744 if (!Subtarget.is64Bit() &&
60745 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
60746 TRI->getEncodingValue(Res.first) >= 8) {
60747 // Register requires REX prefix, but we're in 32-bit mode.
60748 return std::make_pair(0, nullptr);
60749 }
60750
60751 // Make sure it isn't a register that requires AVX512.
60752 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
60753 TRI->getEncodingValue(Res.first) & 0x10) {
60754 // Register requires EVEX prefix.
60755 return std::make_pair(0, nullptr);
60756 }
60757
60758 // Otherwise, check to see if this is a register class of the wrong value
60759 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
60760 // turn into {ax},{dx}.
60761 // MVT::Other is used to specify clobber names.
60762 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
60763 return Res; // Correct type already, nothing to do.
60764
60765 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
60766 // return "eax". This should even work for things like getting 64bit integer
60767 // registers when given an f64 type.
60768 const TargetRegisterClass *Class = Res.second;
60769 // The generic code will match the first register class that contains the
60770 // given register. Thus, based on the ordering of the tablegened file,
60771 // the "plain" GR classes might not come first.
60772 // Therefore, use a helper method.
60773 if (isGRClass(*Class)) {
60774 unsigned Size = VT.getSizeInBits();
60775 if (Size == 1) Size = 8;
60776 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
60777 return std::make_pair(0, nullptr);
60778 Register DestReg = getX86SubSuperRegister(Res.first, Size);
60779 if (DestReg.isValid()) {
60780 bool is64Bit = Subtarget.is64Bit();
60781 const TargetRegisterClass *RC =
60782 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
60783 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
60784 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
60785 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
60786 if (Size == 64 && !is64Bit) {
60787 // Model GCC's behavior here and select a fixed pair of 32-bit
60788 // registers.
60789 switch (DestReg) {
60790 case X86::RAX:
60791 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
60792 case X86::RDX:
60793 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
60794 case X86::RCX:
60795 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
60796 case X86::RBX:
60797 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
60798 case X86::RSI:
60799 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
60800 case X86::RDI:
60801 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
60802 case X86::RBP:
60803 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
60804 default:
60805 return std::make_pair(0, nullptr);
60806 }
60807 }
60808 if (RC && RC->contains(DestReg))
60809 return std::make_pair(DestReg, RC);
60810 return Res;
60811 }
60812 // No register found/type mismatch.
60813 return std::make_pair(0, nullptr);
60814 } else if (isFRClass(*Class)) {
60815 // Handle references to XMM physical registers that got mapped into the
60816 // wrong class. This can happen with constraints like {xmm0} where the
60817 // target independent register mapper will just pick the first match it can
60818 // find, ignoring the required type.
60819
60820 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
60821 if (VT == MVT::f16)
60822 Res.second = &X86::FR16XRegClass;
60823 else if (VT == MVT::f32 || VT == MVT::i32)
60824 Res.second = &X86::FR32XRegClass;
60825 else if (VT == MVT::f64 || VT == MVT::i64)
60826 Res.second = &X86::FR64XRegClass;
60827 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
60828 Res.second = &X86::VR128XRegClass;
60829 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
60830 Res.second = &X86::VR256XRegClass;
60831 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
60832 Res.second = &X86::VR512RegClass;
60833 else {
60834 // Type mismatch and not a clobber: Return an error;
60835 Res.first = 0;
60836 Res.second = nullptr;
60837 }
60838 } else if (isVKClass(*Class)) {
60839 if (VT == MVT::v1i1 || VT == MVT::i1)
60840 Res.second = &X86::VK1RegClass;
60841 else if (VT == MVT::v8i1 || VT == MVT::i8)
60842 Res.second = &X86::VK8RegClass;
60843 else if (VT == MVT::v16i1 || VT == MVT::i16)
60844 Res.second = &X86::VK16RegClass;
60845 else if (VT == MVT::v32i1 || VT == MVT::i32)
60846 Res.second = &X86::VK32RegClass;
60847 else if (VT == MVT::v64i1 || VT == MVT::i64)
60848 Res.second = &X86::VK64RegClass;
60849 else {
60850 // Type mismatch and not a clobber: Return an error;
60851 Res.first = 0;
60852 Res.second = nullptr;
60853 }
60854 }
60855
60856 return Res;
60857}
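As a concrete illustration of the mapping above (a sketch only, requires AVX-512; the function name is made up): the 'k' constraint with an i16 operand selects the VK16 mask-register class, so the value lives in a %k register rather than a GPR.

static unsigned short invert_mask16(unsigned short m) {
  unsigned short r;
  asm("knotw %1, %0" : "=k"(r) : "k"(m));
  return r;
}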
60858
60859 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
60860 // Integer division on x86 is expensive. However, when aggressively optimizing
60861 // for code size, we prefer to use a div instruction, as it is usually smaller
60862 // than the alternative sequence.
60863 // The exception to this is vector division. Since x86 doesn't have vector
60864 // integer division, leaving the division as-is is a loss even in terms of
60865 // size, because it will have to be scalarized, while the alternative code
60866 // sequence can be performed in vector form.
60867 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
60868 return OptSize && !VT.isVector();
60869}
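A short sketch of the consequence (Clang attribute spelling assumed, example not from this file): when a function is optimized for minimum size, the hook above reports scalar division as cheap, so division by a constant stays a single div/idiv instead of being expanded into the usual multiply-and-shift sequence; vector divisions are still expanded because they would otherwise be scalarized.

__attribute__((minsize)) static unsigned div_by_ten(unsigned x) {
  return x / 10; // expected to remain a real divide when the hook returns true
}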
60870
60871void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
60872 if (!Subtarget.is64Bit())
60873 return;
60874
60875 // Update IsSplitCSR in X86MachineFunctionInfo.
60876 X86MachineFunctionInfo *AFI =
60877 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
60878 AFI->setIsSplitCSR(true);
60879}
60880
60881void X86TargetLowering::insertCopiesSplitCSR(
60882 MachineBasicBlock *Entry,
60883 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
60884 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
60885 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
60886 if (!IStart)
60887 return;
60888
60889 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
60890 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
60891 MachineBasicBlock::iterator MBBI = Entry->begin();
60892 for (const MCPhysReg *I = IStart; *I; ++I) {
60893 const TargetRegisterClass *RC = nullptr;
60894 if (X86::GR64RegClass.contains(*I))
60895 RC = &X86::GR64RegClass;
60896 else
60897 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
60898
60899 Register NewVR = MRI->createVirtualRegister(RC);
60900 // Create copy from CSR to a virtual register.
60901 // FIXME: this currently does not emit CFI pseudo-instructions, it works
60902 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
60903 // nounwind. If we want to generalize this later, we may need to emit
60904 // CFI pseudo-instructions.
60905 assert(
60906 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
60907 "Function should be nounwind in insertCopiesSplitCSR!");
60908 Entry->addLiveIn(*I);
60909 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
60910 .addReg(*I);
60911
60912 // Insert the copy-back instructions right before the terminator.
60913 for (auto *Exit : Exits)
60914 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
60915 TII->get(TargetOpcode::COPY), *I)
60916 .addReg(NewVR);
60917 }
60918}
60919
60920 bool X86TargetLowering::supportSwiftError() const {
60921 return Subtarget.is64Bit();
60922}
60923
60924 MachineInstr *
60925 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
60926 MachineBasicBlock::iterator &MBBI,
60927 const TargetInstrInfo *TII) const {
60928 assert(MBBI->isCall() && MBBI->getCFIType() &&
60929 "Invalid call instruction for a KCFI check");
60930
60931 MachineFunction &MF = *MBB.getParent();
60932 // If the call target is a memory operand, unfold it and use R11 for the
60933 // call, so KCFI_CHECK won't have to recompute the address.
60934 switch (MBBI->getOpcode()) {
60935 case X86::CALL64m:
60936 case X86::CALL64m_NT:
60937 case X86::TAILJMPm64:
60938 case X86::TAILJMPm64_REX: {
60941 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
60942 /*UnfoldStore=*/false, NewMIs))
60943 report_fatal_error("Failed to unfold memory operand for a KCFI check");
60944 for (auto *NewMI : NewMIs)
60945 MBBI = MBB.insert(OrigCall, NewMI);
60946 assert(MBBI->isCall() &&
60947 "Unexpected instruction after memory operand unfolding");
60948 if (OrigCall->shouldUpdateAdditionalCallInfo())
60949 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
60950 MBBI->setCFIType(MF, OrigCall->getCFIType());
60951 OrigCall->eraseFromParent();
60952 break;
60953 }
60954 default:
60955 break;
60956 }
60957
60958 MachineOperand &Target = MBBI->getOperand(0);
60959 Register TargetReg;
60960 switch (MBBI->getOpcode()) {
60961 case X86::CALL64r:
60962 case X86::CALL64r_NT:
60963 case X86::TAILJMPr64:
60964 case X86::TAILJMPr64_REX:
60965 assert(Target.isReg() && "Unexpected target operand for an indirect call");
60966 Target.setIsRenamable(false);
60967 TargetReg = Target.getReg();
60968 break;
60969 case X86::CALL64pcrel32:
60970 case X86::TAILJMPd64:
60971 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
60972 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
60973 // 64-bit indirect thunk calls.
60974 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
60975 "Unexpected register for an indirect thunk call");
60976 TargetReg = X86::R11;
60977 break;
60978 default:
60979 llvm_unreachable("Unexpected CFI call opcode");
60980 break;
60981 }
60982
60983 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
60984 .addReg(TargetReg)
60985 .addImm(MBBI->getCFIType())
60986 .getInstr();
60987}
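For context (illustrative C++, not from this file): KCFI instruments indirect calls when a translation unit is built with -fsanitize=kcfi, and the KCFI_CHECK emitted above compares the type hash stored in front of the callee against the hash expected at the call site, using R11 when the call target had to be unfolded from a memory operand.

using handler_t = int (*)(int);
static int dispatch(handler_t h, int v) {
  return h(v); // indirect call; a KCFI_CHECK on 'h' precedes it under -fsanitize=kcfi
}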
60988
60989/// Returns true if stack probing through a function call is requested.
60990 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
60991 return !getStackProbeSymbolName(MF).empty();
60992}
60993
60994/// Returns true if stack probing through inline assembly is requested.
60995 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
60996
60997 // No inline stack probe for Windows, they have their own mechanism.
60998 if (Subtarget.isOSWindows() ||
60999 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
61000 return false;
61001
61002 // If the function specifically requests inline stack probes, emit them.
61003 if (MF.getFunction().hasFnAttribute("probe-stack"))
61004 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
61005 "inline-asm";
61006
61007 return false;
61008}
61009
61010/// Returns the name of the symbol used to emit stack probes or the empty
61011/// string if not applicable.
61012 StringRef
61013 X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
61014 // Inline Stack probes disable stack probe call
61015 if (hasInlineStackProbe(MF))
61016 return "";
61017
61018 // If the function specifically requests stack probes, emit them.
61019 if (MF.getFunction().hasFnAttribute("probe-stack"))
61020 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
61021
61022 // Generally, if we aren't on Windows, the platform ABI does not include
61023 // support for stack probes, so don't emit them.
61024 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
61025 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
61026 return "";
61027
61028 // We need a stack probe to conform to the Windows ABI. Choose the right
61029 // symbol.
61030 if (Subtarget.is64Bit())
61031 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
61032 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
61033}
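A hedged sketch of how these hooks get exercised (the flag-to-attribute mapping described here is the usual Clang behaviour, stated as an assumption rather than taken from this file): building the function below with -fstack-clash-protection tags it with the "probe-stack"="inline-asm" attribute, so hasInlineStackProbe returns true and getStackProbeSymbolName returns the empty string; on Windows, without that attribute, the default __chkstk-family symbol is chosen instead.

// A large local frame: the kind of function stack probing exists for.
static void touch_big_frame() {
  volatile char buf[64 * 1024];
  buf[0] = 1;
  buf[sizeof(buf) - 1] = 1;
}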
61034
61035unsigned
61036 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
61037 // The default stack probe size is 4096 if the function has no stackprobesize
61038 // attribute.
61039 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
61040 4096);
61041}
61042
61043 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
61044 if (ML && ML->isInnermost() &&
61045 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
61046 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
61047 return TargetLowering::getPrefLoopAlignment(ML);
61048 }
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
static const LLT S1
static const LLT F64
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
Definition: BitTracker.cpp:73
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:282
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Looks at all the uses of the given value Returns the Liveness deduced from the uses of this value Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses If the result is MaybeLiveUses might be modified but its content should be ignored(since it might not be complete). DeadArgumentEliminationPass
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:557
Live Register Matrix
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
unsigned const TargetRegisterInfo * TRI
#define R2(n)
#define T1
uint64_t High
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr Register SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static StringRef substr(StringRef Str, uint64_t Len)
This file implements the SmallBitVector class.
This file defines the SmallSet class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
@ ShiftBit
@ UndefBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static bool isX86CCSigned(unsigned X86CC)
Return true if the condition is an signed comparison operation.
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
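The reason the in-register sign extension is needed: PACKSS narrows with signed saturation, so the value must already fit the narrow type for the saturation to be a no-op. A minimal scalar sketch of one PACKSSDW lane (packss_lane is an illustrative name, not an LLVM helper):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of one PACKSSDW lane: narrow i32 to i16 with signed saturation.
static int16_t packss_lane(int32_t v) {
  return (int16_t)std::clamp(v, (int32_t)INT16_MIN, (int32_t)INT16_MAX);
}

int main() {
  int32_t x = 0x12345678;               // arbitrary wide element
  int32_t inreg = (int32_t)(int16_t)x;  // in-register sign extension of the low 16 bits
  // After the sign extension the value fits in i16, so saturation is a no-op
  // and PACKSS yields exactly the truncated value.
  assert(packss_lane(inreg) == (int16_t)x);
  return 0;
}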
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue V)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::P...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips the sign of an FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as a zero or any extension.
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
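For reference, the 64-bit result is reassembled from the register pair with EDX holding the high half and EAX the low half; a trivial sketch (combineEDXEAX is an illustrative name, not an LLVM helper):

#include <cassert>
#include <cstdint>

// Reassemble a 64-bit value returned in the EDX:EAX register pair.
static uint64_t combineEDXEAX(uint32_t edx, uint32_t eax) {
  return ((uint64_t)edx << 32) | eax;
}

int main() {
  assert(combineEDXEAX(0x00000001u, 0x00000002u) == 0x0000000100000002ull);
  return 0;
}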
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to map a 128-bit or larger integer comparison to vector instructions before type legalization spl...
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
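A common way to perform this byte sum on x86 is PSADBW against an all-zero vector, since a sum of absolute differences with zero degenerates into a plain sum of unsigned bytes. A scalar model of one PSADBW qword (psadbw_qword is an illustrative name, not an LLVM helper):

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of one PSADBW qword: sum of absolute differences of 8 byte pairs.
static uint16_t psadbw_qword(const std::array<uint8_t, 8> &A,
                             const std::array<uint8_t, 8> &B) {
  uint16_t Sum = 0;
  for (int i = 0; i < 8; ++i)
    Sum += (uint16_t)(A[i] > B[i] ? A[i] - B[i] : B[i] - A[i]);
  return Sum;
}

int main() {
  // Against a zero operand, PSADBW simply adds up the 8 unsigned bytes.
  std::array<uint8_t, 8> V = {1, 2, 3, 4, 5, 6, 7, 8};
  assert(psadbw_qword(V, {}) == 36);
  return 0;
}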
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, SDValue And1_L, SDValue And1_R, const SDLoc &DL, SelectionDAG &DAG)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, unsigned Reg)
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
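The immediate encoding behind this is straightforward: destination lane i takes a 2-bit source-lane selector in bits [2*i+1 : 2*i]. A minimal stand-alone sketch (shuffleImmForMask is an illustrative name, not the LLVM helper) that reproduces the encoding for a defined, in-range 4-lane mask:

#include <array>
#include <cassert>
#include <cstdint>

// Each destination lane i takes a 2-bit source-lane selector in bits [2*i+1 : 2*i].
static uint8_t shuffleImmForMask(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i) {
    assert(Mask[i] >= 0 && Mask[i] < 4 && "expects a defined, in-range mask");
    Imm |= (uint8_t)((Mask[i] & 0x3) << (2 * i));
  }
  return Imm;
}

int main() {
  assert(shuffleImmForMask({0, 1, 2, 3}) == 0xE4);  // identity: the no-op PSHUFD immediate
  assert(shuffleImmForMask({3, 2, 1, 0}) == 0x1B);  // full reversal
  return 0;
}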
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, bool AllowTruncate)
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG)
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isLegalConversion(MVT VT, bool IsSigned, const X86Subtarget &Subtarget)
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, bool HasVariableMask, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector with a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
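The fold is De Morgan's law applied under the outer AND: ~(~Y & Z) == Y | ~Z. An exhaustive 8-bit check of the identity (illustrative only, not part of the lowering code):

#include <cassert>
#include <cstdint>

int main() {
  // Exhaustively verify (X & (Y | ~Z)) == (X & ~(~Y & Z)) over all 8-bit values.
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y)
      for (unsigned Z = 0; Z < 256; ++Z) {
        uint8_t L = (uint8_t)(X & (Y | (uint8_t)~Z));
        uint8_t R = (uint8_t)(X & (uint8_t)~((uint8_t)~Y & Z));
        assert(L == R);
      }
  return 0;
}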
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
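The transform is valid because sign/zero extension distributes over an add that is known not to wrap. A small exhaustive check of the sext/add_nsw case over i8 (illustrative only, not part of LLVM):

#include <cassert>
#include <cstdint>

int main() {
  // For every i8 value x and constant C where x + C does not overflow in i8
  // (the add_nsw case), sign-extending after the add equals adding the
  // sign-extended operands.
  for (int x = -128; x <= 127; ++x)
    for (int c = -128; c <= 127; ++c) {
      int sum = x + c;
      if (sum < -128 || sum > 127)
        continue;  // signed overflow: nsw would not hold, transform does not apply
      assert((int32_t)(int8_t)sum == (int32_t)x + (int32_t)c);
    }
  return 0;
}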
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget)
Try to lower a vector shuffle as a bit shift (shifts in zeros).
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If a vector select has an operand that is -1 or 0, try to simplify the select to a bitwise logic oper...
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
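Concretely, on the raw bit pattern FABS is an AND with the inverted sign mask and FNEG is an XOR with the sign mask. A minimal f32 sketch (assumes C++20 for std::bit_cast; the names and constants are illustrative, not taken from the lowering code):

#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const uint32_t SignMask = 0x80000000u;
  float x = -3.5f;
  uint32_t bits = std::bit_cast<uint32_t>(x);
  // FABS clears the sign bit (AND with ~SignMask); FNEG flips it (XOR with SignMask).
  float fabsX = std::bit_cast<float>(bits & ~SignMask);
  float fnegX = std::bit_cast<float>(bits ^ SignMask);
  assert(fabsX == std::fabs(x) && fnegX == -x);
  return 0;
}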
ShrinkMode
Different mul shrinking modes.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, SDValue Root, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size, falls within the specified sequential range (starting at Low, with step Step), or is undef or zero.
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shuffled.
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an unary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
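A scalar illustration of this fold (a standalone sketch with arbitrary values, not the DAG combine itself): xor-ing a comparison result with 1 is the same as comparing with the inverted condition.

#include <cassert>

int main() {
  for (int a = -2; a <= 2; ++a)
    for (int b = -2; b <= 2; ++b) {
      assert(((a == b) ^ 1) == (a != b)); // xor(setcc eq, 1) -> setcc ne
      assert(((a < b) ^ 1) == (a >= b));  // xor(setcc lt, 1) -> setcc ge
    }
  return 0;
}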
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
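A scalar analogue of that expansion (equal32 is a hypothetical stand-in, not an LLVM helper): equality of two buffers can be tested by OR-ing together the XORs of corresponding chunks and comparing the result against zero, which is the tree shape this helper builds over vector-sized pieces.

#include <cstdint>
#include <cstring>

// Hypothetical illustration: 32-byte equality as an OR/XOR tree.
static bool equal32(const void *A, const void *B) {
  uint64_t a[4], b[4];
  std::memcpy(a, A, sizeof(a));
  std::memcpy(b, B, sizeof(b));
  uint64_t Acc = (a[0] ^ b[0]) | (a[1] ^ b[1]) |
                 (a[2] ^ b[2]) | (a[3] ^ b[3]);
  return Acc == 0; // every chunk equal iff the OR of all XORs is zero
}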
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to lower a vector shuffle as a byte rotation.
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first, then perform the operation as a scalar op.
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
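A worked example (arbitrary sample mask, not taken from the source): for two 4-element inputs, the mask {0, 5, 2, 7} draws even result lanes from the first vector (indices 0..3) and odd result lanes from the second (indices 4..7), which is the alternating shape this predicate recognizes.

#include <cstdio>

int main() {
  const int Mask[4] = {0, 5, 2, 7};
  for (int i = 0; i < 4; ++i)
    std::printf("result[%d] <- %s[%d]\n", i,
                Mask[i] < 4 ? "Op0" : "Op1", Mask[i] % 4);
  return 0; // prints Op0[0], Op1[1], Op0[2], Op1[3]
}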
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a parent frame after catching an exception, recover the parent frame pointer.
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp).
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expansion.
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size, is the undef sentinel value or equal to CmpVal.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
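A scalar sketch of why the AND mask is needed (illustration only; truncViaPackus is a hypothetical name, not an LLVM helper): PACKUS saturates unsigned values into the narrower lane, so clearing the upper bits of each wide lane first makes that saturation a no-op and the pack becomes a plain truncation.

#include <array>
#include <cstdint>

static std::array<uint8_t, 8> truncViaPackus(const std::array<uint16_t, 8> &V) {
  std::array<uint8_t, 8> R{};
  for (int i = 0; i < 8; ++i) {
    uint16_t Masked = V[i] & 0x00FF;     // in-reg zero extension (AND mask)
    R[i] = static_cast<uint8_t>(Masked); // PACKUS would saturate to 255, but a
                                         // masked value never exceeds 255
  }
  return R;
}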
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB)
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input is used.
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size, falls within the specified sequential range (starting at Low, with step Step) or is undef.
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from their inputs.
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG)
Fold "masked merge" expressions like (m & x) | (~m & y) into the equivalent ((x ^ y) & m) ^ y) patter...
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
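As a quick scalar check of the rewrite (standalone sketch, not the DAG code): xor with -1 is bitwise NOT, so the folded form computes ~X & Y, which is exactly what ANDNP/ANDN implements.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Xs[] = {0x00000000u, 0xFFFFFFFFu, 0x12345678u};
  const uint32_t Ys[] = {0x00000000u, 0xFFFFFFFFu, 0x0F0F0F0Fu};
  for (uint32_t X : Xs)
    for (uint32_t Y : Ys)
      assert(((X ^ 0xFFFFFFFFu) & Y) == (~X & Y)); // (and (xor X, -1), Y)
  return 0;
}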
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code? The current x86 ISA includes the following FP cmov instructions: fcmovb, fcmove, fcmovbe, fcmovu and their negated forms (fcmovnb, fcmovne, fcmovnbe, fcmovnu).
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true, return the operand index of that constant; otherwise return -1.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
if(isa< SExtInst >(LHS)) std auto IsFreeTruncation
static const unsigned FramePtr
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5463
static APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition: APFloat.cpp:5488
void clearSign()
Definition: APFloat.h:1300
opStatus next(bool nextDown)
Definition: APFloat.h:1256
void changeSign()
Definition: APFloat.h:1299
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:1081
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1386
uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:493
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1007
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1492
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:910
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:206
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
APInt abs() const
Get the absolute value.
Definition: APInt.h:1773
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:466
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are both set.
Definition: APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1079
int32_t exactLogBase2() const
Definition: APInt.h:1761
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1607
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1577
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:624
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1511
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1434
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1594
void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:370
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1417
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:471
bool isMask(unsigned numBits) const
Definition: APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:405
bool isNonNegative() const
Determine if this APInt value is non-negative (>= 0).
Definition: APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1150
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:959
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
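A small standalone sketch tying a few of these APInt factory helpers together (assumes the LLVM headers are available to build against; the expected values in the comments follow from the documented semantics above).

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  using llvm::APInt;
  APInt Lo = APInt::getLowBitsSet(16, 4);   // 0x000F: bottom 4 bits set
  APInt Hi = APInt::getHighBitsSet(16, 4);  // 0xF000: top 4 bits set
  APInt Mid = APInt::getBitsSet(16, 4, 8);  // 0x00F0: bits [4, 8) set
  assert(Lo.getZExtValue() == 0x000F);
  assert(Hi.getZExtValue() == 0xF000);
  assert(Mid.getZExtValue() == 0x00F0);
  return 0;
}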
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1389
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:455
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1635
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:399
APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:947
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:652
an instruction that atomically reads a memory location, combines it with another value, and then stores the result back, returning the old value.
Definition: Instructions.h:704
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:827
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:764
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:768
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
Value * getPointerOperand()
Definition: Instructions.h:870
BinOp getOperation() const
Definition: Instructions.h:805
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:861
Value * getValOperand()
Definition: Instructions.h:874
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:847
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:392
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:893
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
Value * getCalledOperand() const
Definition: InstrTypes.h:1334
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
Definition: CmpPredicate.h:22
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matching the ArrayRef passed in.
Definition: Constants.cpp:3007
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1597
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:403
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if possible, or null if not.
Definition: Constants.cpp:435
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:457
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Tagged union holding either a T or a Error.
Definition: Error.h:481
This is a fast-path instruction selection class that generates poor code and doesn't support illegal types or non-trivial lowering, but runs quickly.
Definition: FastISel.h:66
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
FunctionLoweringInfo - This contains information that is global to a function that is used when lowering a region of the function.
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:128
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:707
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:778
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:704
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition: Function.h:277
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:905
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1048
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1', drop it.
Definition: GlobalValue.h:568
bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:405
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:272
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2705
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:121
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other instructions.
Definition: Instruction.h:169
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
Class to represent integer types.
Definition: DerivedTypes.h:42
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:74
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:176
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:661
MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:241
MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:246
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created and uniqued by the MCContext class.
Definition: MCSymbol.h:41
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:307
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type of the same bit width.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor blocks.
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
unsigned succ_size() const
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:70
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:586
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified register class.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc, or post-dec.
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:354
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: ArrayRef.h:310
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Definition: DerivedTypes.h:686
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:115
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if this node is an UNDEF node.
iterator_range< user_iterator > users()
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUses uses of the indicated value.
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
Get the index which selects a specific result in the SDNode.
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
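A small sketch (matcher name hypothetical) showing how these SDValue/SDNode accessors are typically combined to match a single-use ADD of a constant:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical matcher: is Op a single-use (add X, C) with C a ConstantSDNode?
static bool isSingleUseAddOfConstant(SDValue Op) {
  if (Op.getOpcode() != ISD::ADD || !Op.hasOneUse())
    return false;
  // After canonicalization the constant usually sits in operand 1.
  return isa<ConstantSDNode>(Op.getOperand(1));
}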
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:371
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:953
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:751
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:983
SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:499
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:802
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
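A hedged sketch (function name and operands are assumptions) of the usual getSetCC pattern, picking the comparison result type via getSetCCResultType and feeding it into getSelect:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical: select TVal if LHS < RHS (signed compare), otherwise FVal.
static SDValue buildLessThanSelect(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                   SDValue LHS, SDValue RHS, SDValue TVal,
                                   SDValue FVal) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                    LHS.getValueType());
  SDValue Cond = DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETLT);
  return DAG.getSelect(DL, VT, Cond, TVal, FVal);
}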
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:458
SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:761
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:857
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:828
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:505
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
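A minimal sketch (helper name hypothetical; pointer info left unknown and the chain taken from the entry node purely for brevity) of pairing CreateStackTemporary with getStore to spill a value to a fresh stack slot:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical: store Val to a new stack temporary and return the store chain.
static SDValue spillToStackTemp(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
                                SDValue &SlotPtr) {
  EVT VT = Val.getValueType();
  Align SlotAlign = DAG.getDataLayout().getPrefTypeAlign(
      VT.getTypeForEVT(*DAG.getContext()));
  SlotPtr = DAG.CreateStackTemporary(VT.getStoreSize(), SlotAlign);
  return DAG.getStore(DAG.getEntryNode(), DL, Val, SlotPtr,
                      MachinePointerInfo(), SlotAlign);
}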
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
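A minimal sketch (function name and the choice of ISD::ADD are illustrative) of how getConstant and getNode combine; getNode performs CSE, so an identical node already in the DAG is simply returned:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical: return X + 1 in X's own value type.
static SDValue emitAddOne(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  SDValue One = DAG.getConstant(1, DL, VT);
  return DAG.getNode(ISD::ADD, DL, VT, X, One);
}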
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:701
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:797
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:874
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
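A short sketch (helper name hypothetical) of using computeKnownBits to prove a bit-level fact about an operand:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Hypothetical: is the low bit of Op provably zero?
static bool lowBitIsKnownZero(const SelectionDAG &DAG, SDValue Op) {
  KnownBits Known = DAG.computeKnownBits(Op);
  return Known.Zero[0];
}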
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
std::optional< uint64_t > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
Definition: SelectionDAG.h:907
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:937
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
void reserve(size_type NumEntries)
Definition: SmallPtrSet.h:112
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
iterator erase(const_iterator CI)
Definition: SmallVector.h:737
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:578
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:286
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
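A tiny sketch (helper name hypothetical) of the append/push_back pattern these SmallVector entries describe, gathering a node's operands plus one extra value:

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical: copy N's operands into Ops and append Extra at the end.
static void gatherOps(SDNode *N, SDValue Extra, SmallVectorImpl<SDValue> &Ops) {
  Ops.clear();
  Ops.append(N->op_begin(), N->op_end());
  Ops.push_back(Extra);
}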
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:571
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
size_t size_type
Definition: StringRef.h:57
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
static constexpr size_t npos
Definition: StringRef.h:53
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:176
size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C, or npos if not found.
Definition: StringRef.cpp:253
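A small sketch (function name and the "%" prefix are assumptions) of the starts_with/substr pair described above:

#include "llvm/ADT/StringRef.h"
using namespace llvm;

// Hypothetical: strip an optional "%" prefix from a register-name string.
static StringRef stripRegPrefix(StringRef Name) {
  if (Name.starts_with("%"))
    return Name.substr(1);
  return Name;
}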
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
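A minimal sketch (names and the mapping itself are illustrative) of the Case/Default chaining that StringSwitch provides:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Hypothetical: map a lower-case register name to a small integer code.
static unsigned classifyRegName(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("eax", 1)
      .Case("ebx", 2)
      .Default(0);
}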
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:406
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
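A hedged fragment (the register class, subtarget accessor, and opcode/type choices are hypothetical, not X86's actual configuration) showing how a target's TargetLowering constructor typically drives these hooks:

// Inside a hypothetical XXXTargetLowering constructor:
addRegisterClass(MVT::i32, &XXX::GPR32RegClass);        // hypothetical register class
setOperationAction(ISD::SDIV, MVT::i32, Expand);        // expand into a sequence/libcall
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);   // handled in LowerOperation
setTruncStoreAction(MVT::f64, MVT::f32, Expand);        // no f64->f32 truncating store
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
computeRegisterProperties(Subtarget.getRegisterInfo()); // after all register classes are added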
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to ...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:695
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition: Triple.h:752
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:585
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
static IntegerType * getInt1Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:411
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
uint64_t getArrayNumElements() const
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:31
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
use_iterator use_begin()
Definition: Value.h:360
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const
bool hasBasePointer(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:236
bool hasAnyFMA() const
Definition: X86Subtarget.h:203
bool isOSWindows() const
Definition: X86Subtarget.h:327
bool isTargetMachO() const
Definition: X86Subtarget.h:293
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:221
bool hasSSE1() const
Definition: X86Subtarget.h:193
bool hasBitScanPassThrough() const
Definition: X86Subtarget.h:269
bool isPICStyleGOT() const
Definition: X86Subtarget.h:333
bool hasSSE42() const
Definition: X86Subtarget.h:198
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:118
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:281
bool canUseCMOV() const
Definition: X86Subtarget.h:192
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:336
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:305
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:185
bool isTargetDarwin() const
Definition: X86Subtarget.h:285
bool isTargetWin64() const
Definition: X86Subtarget.h:329
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:178
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:283
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:122
bool useAVX512Regs() const
Definition: X86Subtarget.h:253
bool hasSSE3() const
Definition: X86Subtarget.h:195
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:342
bool hasAVX512() const
Definition: X86Subtarget.h:201
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:232
bool hasSSE41() const
Definition: X86Subtarget.h:197
bool isTargetELF() const
Definition: X86Subtarget.h:291
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:209
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:186
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:194
bool hasSSSE3() const
Definition: X86Subtarget.h:196
bool hasInt256() const
Definition: X86Subtarget.h:202
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:334
bool isTargetCygMing() const
Definition: X86Subtarget.h:325
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:289
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:132
bool hasAVX() const
Definition: X86Subtarget.h:199
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:317
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:225
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:321
bool isTargetNaCl64() const
Definition: X86Subtarget.h:301
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:124
bool useBWIRegs() const
Definition: X86Subtarget.h:262
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:200
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y --> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
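The overrides listed above are mostly consulted by target-independent SelectionDAG and CodeGenPrepare code through the TargetLowering interface rather than called from this file. A minimal caller-side sketch, assuming a TargetLowering reference and an LLVMContext are available (both assumed, not part of this listing); the i32 query type is only an example:

  // Sketch: generic code consulting one hook from the listing above.
  static bool speculateCttzOK(const TargetLowering &TLI, LLVMContext &Ctx) {
    // The hook answers per-type; i32 is an arbitrary example type.
    return TLI.isCheapToSpeculateCttz(IntegerType::get(Ctx, 32));
  }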
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2982
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1197
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:753
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:491
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1360
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:153
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:512
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1340
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1226
@ ConstantFP
Definition: ISDOpcodes.h:77
@ STRICT_FATAN2
Definition: ISDOpcodes.h:428
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1342
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ STRICT_FCEIL
Definition: ISDOpcodes.h:441
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1343
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:130
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1073
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:502
@ STRICT_FTANH
Definition: ISDOpcodes.h:431
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:999
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:157
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1325
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:451
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1299
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1304
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:871
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:492
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:964
@ STRICT_FLOG2
Definition: ISDOpcodes.h:436
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1270
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1494
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1173
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:141
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ STRICT_FASIN
Definition: ISDOpcodes.h:425
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:685
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:465
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1118
@ STRICT_FATAN
Definition: ISDOpcodes.h:427
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:752
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:788
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1341
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1391
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:642
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1222
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:445
@ GET_ROUNDING
Returns the current rounding mode: -1 = Undefined, 0 = Round to 0, 1 = Round to nearest (ties to even), 2 = Round to ...
Definition: ISDOpcodes.h:931
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:967
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:966
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:450
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:439
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:907
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:440
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ STRICT_FSINH
Definition: ISDOpcodes.h:429
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1344
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:120
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1286
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ ConstantPool
Definition: ISDOpcodes.h:82
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:860
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ STRICT_FROUND
Definition: ISDOpcodes.h:443
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:464
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1372
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:975
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:442
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:444
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:458
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:457
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1078
@ STRICT_FCOSH
Definition: ISDOpcodes.h:430
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:976
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:680
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:435
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition: ISDOpcodes.h:223
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_FEXP2
Definition: ISDOpcodes.h:433
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ ExternalSymbol
Definition: ISDOpcodes.h:83
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition: ISDOpcodes.h:669
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:882
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:438
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:906
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:147
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1217
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1392
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:437
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1083
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:692
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1276
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ STRICT_FACOS
Definition: ISDOpcodes.h:426
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
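The target-independent opcodes above are materialized through SelectionDAG::getNode. A minimal sketch, assuming DAG, DL, and an i32-typed SDValue X are already in scope (all three assumed, not from this listing):

  // Build (zero_extend X to i64); getNode folds trivially-constant cases itself.
  SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, X);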
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1686
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false)
Hook for matching ConstantSDNode predicate.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1681
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1498
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1668
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1643
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1610
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1590
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1649
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
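The ISD node predicates above are the usual guards in DAG combines. A minimal sketch of matchUnaryPredicate, assuming Op is a build-vector or splat SDValue; the non-negativity check is only an example predicate:

  // Sketch: true if every constant element of Op is non-negative.
  static bool allElementsNonNegative(SDValue Op) {
    return ISD::matchUnaryPredicate(Op, [](ConstantSDNode *C) {
      return !C->getAPIntValue().isNegative();
    });
  }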
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:664
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
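The PatternMatch combinators above compose into declarative IR matchers. A minimal sketch, assuming an llvm::Value *V taken from the IR being combined; the helper name and the (A & ~B) shape are illustrative, not from this file:

  using namespace llvm::PatternMatch;
  // Recognize (A & ~B) with the operands in either order; binds A and B on success.
  static bool matchAndNot(Value *V, Value *&A, Value *&B) {
    return match(V, m_c_And(m_Value(A), m_Not(m_Value(B))));
  }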
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
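Each RTLIB getter above maps a type pair to a runtime-library routine, or to UNKNOWN_LIBCALL when none exists. A minimal sketch; the f32 -> i64 pair is only an example:

  // Sketch: pick the soft-float libcall for an unsigned f32 -> i64 conversion.
  RTLIB::Libcall LC = RTLIB::getFPTOUINT(MVT::f32, MVT::i64);
  if (LC == RTLIB::UNKNOWN_LIBCALL)
    report_fatal_error("no libcall for this FP_TO_UINT conversion");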
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:214
@ FS
Definition: X86.h:211
@ PTR64
Definition: X86.h:215
@ PTR32_SPTR
Definition: X86.h:213
@ GS
Definition: X86.h:210
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:411
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:391
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:488
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:450
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:432
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:456
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:438
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:476
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:403
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:363
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:472
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:460
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:425
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:480
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:444
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:419
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:387
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask, and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeros out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
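A minimal sketch of isConstantSplat, assuming Op is an SDValue operand under inspection; the power-of-two check is only an example of acting on the extracted splat value:

  APInt SplatVal;
  if (X86::isConstantSplat(Op, SplatVal, /*AllowPartialUndefs=*/true) &&
      SplatVal.isPowerOf2()) {
    // A shift-based lowering could be preferred here.
  }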
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:48
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:121
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
@ Length
Definition: DWP.cpp:480
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:139
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1565
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
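A minimal sketch of enumerate over a shuffle mask, assuming Mask is an ArrayRef<int> whose negative entries are the SM_Sentinel* values listed further below:

  for (const auto &E : enumerate(Mask))
    if (E.value() < 0)
      dbgs() << "lane " << E.index() << " is undef or zero\n";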
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
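A minimal sketch of getX86SubSuperRegister; the RAX query is illustrative and yields the 8-bit alias:

  MCRegister Sub = getX86SubSuperRegister(X86::RAX, /*Size=*/8);  // X86::AL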
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg)
Replace the address used in the instruction with the direct memory reference.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
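A minimal sketch of DecodeBLENDMask for a v4f32 blend; the immediate 0b0101 is only an example and selects the second source in lanes 0 and 2:

  SmallVector<int, 4> ShuffleMask;
  DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, ShuffleMask);
  // ShuffleMask is now {4, 1, 6, 3}; indices >= NumElts refer to the second source.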
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
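A minimal sketch combining BuildMI with addFrameReference, assuming MBB, an insertion iterator MI, MIMD, TII, DestReg, and a frame index FI are in scope (all assumed, not from this listing); X86::MOV32rm is used purely as an example opcode:

  // Load a 32-bit value from frame slot FI into DestReg.
  addFrameReference(
      BuildMI(MBB, MI, MIMD, TII->get(X86::MOV32rm), DestReg), FI);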
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:298
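A minimal sketch pairing isPowerOf2_64 with Log2_64 (declared further down in this listing) to turn a division by a power-of-two constant into a shift; the value 8 is only an example:

  uint64_t Divisor = 8;                    // example compile-time constant
  if (isPowerOf2_64(Divisor)) {
    unsigned ShAmt = Log2_64(Divisor);     // 8 -> 3, so x / 8 becomes x >> 3
    (void)ShAmt;
  }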
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition: STLExtras.h:2055
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1547
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:348
bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
unsigned M1(unsigned Val)
Definition: VE.h:376
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:156
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
@ SM_SentinelUndef
@ SM_SentinelZero
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:161
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1978
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1866
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Add
Sum of integers.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
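A hedged sketch of a typical query, assuming these helpers are the SelectionDAGNodes.h ones used elsewhere in the backends; the helper name is illustrative and there is no DAG to run it against standalone:
#include "llvm/CodeGen/SelectionDAGNodes.h"
// Hedged sketch: does this operand hold, or splat, the constant one?
static bool isSplatOfOne(llvm::SDValue V) {
  if (llvm::ConstantSDNode *C = llvm::isConstOrConstSplat(V, /*AllowUndefs=*/true))
    return C->isOne();
  return false;
}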
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
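A minimal sketch, assuming the llvm/Support/Alignment.h declaration above, of how the known alignment changes after adding an offset to an aligned base:
#include "llvm/Support/Alignment.h"
#include <cassert>
// Hedged sketch: commonAlignment is the alignment guaranteed for base + offset.
static void commonAlignmentSketch() {
  assert(llvm::commonAlignment(llvm::Align(16), 4) == llvm::Align(4));    // offset limits it
  assert(llvm::commonAlignment(llvm::Align(16), 32) == llvm::Align(16));  // offset preserves it
}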
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
const char * toString(DWARFSectionKind Kind)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2087
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ TRUNCATE2_TO_REG
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
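A hedged sketch of decoding the PSHUFD immediate 0x1B for a v4i32 shuffle, assuming the X86ShuffleDecode.h declaration used by this file is visible:
#include "llvm/ADT/SmallVector.h"
// Hedged sketch: each 2-bit field of the immediate selects one 32-bit lane.
static void pshufdMaskSketch() {
  llvm::SmallVector<int, 4> Mask;
  llvm::DecodePSHUFMask(/*NumElts=*/4, /*ScalarBits=*/32, /*Imm=*/0x1B, Mask);
  // Mask should now be {3, 2, 1, 0}: a full reverse of the four 32-bit lanes.
}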
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1624
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
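A hedged sketch, assuming the llvm/Analysis/VectorUtils.h declaration, showing that undef (negative) elements are ignored when locating the splat index:
#include "llvm/Analysis/VectorUtils.h"
#include <cassert>
// Hedged sketch: a splat mask with undef holes still reports its splat index.
static void splatIndexSketch() {
  assert(llvm::getSplatIndex({2, -1, 2, 2}) == 2);
  assert(llvm::getSplatIndex({0, 1, 0, 1}) == -1);  // two distinct indices: not a splat
}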
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
#define EQ(a, b)
Definition: regexec.c:112
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:257
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:302
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:306
static const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:280
static const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:259
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:315
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:258
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:255
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:256
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:318
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
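A minimal sketch tying several of the EVT queries above together for a v8i32 type; it only assumes the llvm/CodeGen/ValueTypes.h declarations listed here:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
// Hedged sketch: basic vector-type queries on v8i32 and its half-width sibling.
static void evtSketch() {
  llvm::LLVMContext Ctx;
  llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 8);  // v8i32
  assert(VT.isVector() && VT.is256BitVector());
  assert(VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 32);
  llvm::EVT Half = VT.getHalfNumVectorElementsVT(Ctx);            // v4i32
  assert(Half.is128BitVector());
}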
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
Definition: KnownBits.cpp:765
static std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:488
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:178
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:100
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:79
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:234
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:65
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition: KnownBits.h:266
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:153
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:281
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:85
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:43
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:164
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:53
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:73
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition: KnownBits.h:103
static KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:228
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:217
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:288
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:303
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition: KnownBits.h:172
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition: KnownBits.h:336
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:188
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:240
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:137
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:97
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:91
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:804
static std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:526
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:82
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:59
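A minimal sketch combining a few of the KnownBits queries listed above, assuming only the llvm/Support/KnownBits.h declarations:
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
// Hedged sketch: a fully known constant, then a zero extension of it.
static void knownBitsSketch() {
  llvm::KnownBits K = llvm::KnownBits::makeConstant(llvm::APInt(32, 8));
  assert(K.isConstant() && K.countMinTrailingZeros() == 3);
  llvm::KnownBits Wide = K.zext(64);  // the new high bits become known zero
  assert(Wide.getBitWidth() == 64 && Wide.isNonZero());
}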
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.