@@ -513,6 +513,22 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
   }
 }
 
+static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
+                                                     bool Store) {
+  if (SuperRegSize % 16 == 0) {
+    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
+                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
+  }
+
+  if (SuperRegSize % 8 == 0) {
+    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
+                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
+  }
+
+  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
+                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR };
+}
+
 void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                int Index,
                                RegScavenger *RS) const {
@@ -522,7 +538,6 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
   bool IsKill = MI->getOperand(0).isKill();
   const DebugLoc &DL = MI->getDebugLoc();
@@ -534,7 +549,6 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
-  const unsigned EltSize = 4;
   unsigned OffsetReg = AMDGPU::M0;
   unsigned M0CopyReg = AMDGPU::NoRegister;
 
@@ -546,27 +560,51 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
     }
   }
 
+  unsigned ScalarStoreOp;
+  unsigned EltSize = 4;
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToSMEM) {
       int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+      // The allocated memory size is really the wavefront size * the frame
+      // index size. The widest register class is 64 bytes, so a 4-byte scratch
+      // allocation is enough to spill this in a single stack object.
+      //
+      // FIXME: Frame size/offsets are computed earlier than this, so the extra
+      // space is still unnecessarily allocated.
+
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
       MachineMemOperand *MMO
         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                    EltSize, MinAlign(Align, EltSize * i));
 
-      // Add i * 4 wave offset.
-      //
       // SMEM instructions only support a single offset, so increment the wave
       // offset.
 
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
       if (Offset != 0) {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg())
@@ -576,7 +614,7 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
           .addReg(MFI->getScratchWaveOffsetReg());
       }
 
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
+      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
        .addReg(MFI->getScratchRSrcReg()) // sbase
        .addReg(OffsetReg, RegState::Kill) // soff
@@ -656,7 +694,6 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
   const SIInstrInfo *TII = ST.getInstrInfo();
   const DebugLoc &DL = MI->getDebugLoc();
 
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
   bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
 
@@ -673,16 +710,34 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
     }
   }
 
+  unsigned EltSize = 4;
+  unsigned ScalarLoadOp;
+
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   int64_t FrOffset = FrameInfo.getObjectOffset(Index);
 
-  const unsigned EltSize = 4;
-
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToSMEM) {
+      // FIXME: Size may be > 4 but extra bytes wasted.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
@@ -691,7 +746,7 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                    EltSize, MinAlign(Align, EltSize * i));
 
       // Add i * 4 offset
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
       if (Offset != 0) {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg())
@@ -702,14 +757,14 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
       }
 
       auto MIB =
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
+        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
        .addReg(MFI->getScratchRSrcReg()) // sbase
        .addReg(OffsetReg, RegState::Kill) // soff
        .addImm(0) // glc
        .addMemOperand(MMO);
 
       if (NumSubRegs > 1)
-        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+        MIB.addReg(SuperReg, RegState::ImplicitDefine);
 
       continue;
     }
@@ -725,7 +780,7 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
         .addImm(Spill.Lane);
 
       if (NumSubRegs > 1)
-        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+        MIB.addReg(SuperReg, RegState::ImplicitDefine);
     } else {
       // Restore SGPR from a stack slot.
       // FIXME: We should use S_LOAD_DWORD here for VI.
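
For reference, the width selection and offset math introduced by this change can be illustrated with a small standalone program. This is only a sketch, not part of the patch: pickSpillOp is a hypothetical stand-in for the new getSpillEltSize helper, the opcode strings stand in for the AMDGPU::S_BUFFER_* enums, and a 64-lane wavefront with a 32-byte SGPR tuple is assumed.

// Standalone sketch of the spill-width selection and offset computation.
// Assumptions: 64-lane wavefront, a 32-byte (8-dword) SGPR tuple, and a
// hypothetical frame-object offset; opcode names are illustrative strings.
#include <cstdio>
#include <string>
#include <utility>

// Mirrors the shape of getSpillEltSize(): prefer the widest scalar buffer op
// that evenly divides the super-register size, falling back to one dword.
static std::pair<unsigned, std::string> pickSpillOp(unsigned SuperRegSize,
                                                    bool Store) {
  if (SuperRegSize % 16 == 0)
    return {16, Store ? "S_BUFFER_STORE_DWORDX4_SGPR"
                      : "S_BUFFER_LOAD_DWORDX4_SGPR"};
  if (SuperRegSize % 8 == 0)
    return {8, Store ? "S_BUFFER_STORE_DWORDX2_SGPR"
                     : "S_BUFFER_LOAD_DWORDX2_SGPR"};
  return {4, Store ? "S_BUFFER_STORE_DWORD_SGPR" : "S_BUFFER_LOAD_DWORD_SGPR"};
}

int main() {
  const unsigned WavefrontSize = 64; // assumed wave64
  const unsigned SuperRegSize = 32;  // e.g. an 8-dword SGPR tuple, in bytes
  const long FrOffset = 16;          // hypothetical frame-object offset

  auto [EltSize, Op] = pickSpillOp(SuperRegSize, /*Store=*/true);
  unsigned NumSubRegs = SuperRegSize / EltSize;

  // New formula from the patch: only the frame offset is scaled by the
  // wavefront size; the per-element byte offset is added afterwards.
  for (unsigned i = 0; i != NumSubRegs; ++i) {
    long Offset = (long)WavefrontSize * FrOffset + (long)EltSize * i;
    std::printf("%s at soffset %ld\n", Op.c_str(), Offset);
  }
  return 0;
}

The loop shows the intent of the offset change: the old code scaled the whole expression (FrOffset + 4 * i) by the wavefront size, which only works for 4-byte elements, while the new form places each EltSize-byte chunk at consecutive byte offsets within the scratch object.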