Commit c47701c
AMDGPU: Use wider scalar spills for SGPR spilling
Since the spill is for the whole wave, these don't have the swizzling problems that vector stores do, and a single 4-byte allocation is enough to spill a 64-element register. This should reduce the number of spill instructions and put all the spills for a register in the same cacheline.

This should save allocated private size, but for now it doesn't: the extra slots are still allocated for each component but never used, because the frame layout is essentially finalized before frame indices are replaced. To always use the scalar store path, this should probably be moved into processFunctionBeforeFrameFinalized.

llvm-svn: 288445
1 parent 28b9668 commit c47701c
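
The heart of the change is the spill-offset formula, visible in both hunks below: the old code placed each 4-byte component a full wave-stride apart (WavefrontSize * (FrOffset + 4 * i)), while the new code scales only the frame offset by the wave size and packs the components contiguously ((WavefrontSize * FrOffset) + (EltSize * i)), which is what lets a single wide store cover a whole register. A minimal standalone sketch of the two formulas, with illustrative values (not LLVM API):

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t WavefrontSize = 64; // lanes per wave
  const int64_t FrOffset = 0;       // frame offset of the spill slot
  const int64_t EltSize = 4;        // bytes per spilled SGPR component

  for (int64_t i = 0; i < 4; ++i) {
    // Old: each component lands a whole wave-stride (64 * 4 = 0x100) apart,
    // matching the s_add_u32 m0, s91, 0x100/0x200/0x300 test checks below.
    int64_t OldOffset = WavefrontSize * (FrOffset + 4 * i);
    // New: components pack right after the wave-scaled base, so one
    // s_buffer_store_dwordx4 covers all four dwords in the same cacheline.
    int64_t NewOffset = (WavefrontSize * FrOffset) + (EltSize * i);
    std::printf("i=%lld old=0x%llx new=0x%llx\n", (long long)i,
                (long long)OldOffset, (long long)NewOffset);
  }
  return 0;
}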

File tree: 4 files changed, +259 -53 lines

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 70 additions & 15 deletions
@@ -513,6 +513,22 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
   }
 }
 
+static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
+                                                     bool Store) {
+  if (SuperRegSize % 16 == 0) {
+    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
+                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
+  }
+
+  if (SuperRegSize % 8 == 0) {
+    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
+                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
+  }
+
+  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
+                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
+}
+
 void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                int Index,
                                RegScavenger *RS) const {
@@ -522,7 +538,6 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
   bool IsKill = MI->getOperand(0).isKill();
   const DebugLoc &DL = MI->getDebugLoc();
@@ -534,7 +549,6 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
-  const unsigned EltSize = 4;
   unsigned OffsetReg = AMDGPU::M0;
   unsigned M0CopyReg = AMDGPU::NoRegister;
 
@@ -546,27 +560,51 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
     }
   }
 
+  unsigned ScalarStoreOp;
+  unsigned EltSize = 4;
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToSMEM) {
       int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+      // The allocated memory size is really the wavefront size * the frame
+      // index size. The widest register class is 64 bytes, so a 4-byte scratch
+      // allocation is enough to spill this in a single stack object.
+      //
+      // FIXME: Frame size/offsets are computed earlier than this, so the extra
+      // space is still unnecessarily allocated.
+
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
       MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
 
-      // Add i * 4 wave offset.
-      //
       // SMEM instructions only support a single offset, so increment the wave
       // offset.
 
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
       if (Offset != 0) {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg())
@@ -576,7 +614,7 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
           .addReg(MFI->getScratchWaveOffsetReg());
       }
 
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
+      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
         .addReg(SubReg, getKillRegState(IsKill)) // sdata
         .addReg(MFI->getScratchRSrcReg()) // sbase
         .addReg(OffsetReg, RegState::Kill) // soff
@@ -656,7 +694,6 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
   const SIInstrInfo *TII = ST.getInstrInfo();
   const DebugLoc &DL = MI->getDebugLoc();
 
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
   bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
 
@@ -673,16 +710,34 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
     }
   }
 
+  unsigned EltSize = 4;
+  unsigned ScalarLoadOp;
+
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   int64_t FrOffset = FrameInfo.getObjectOffset(Index);
 
-  const unsigned EltSize = 4;
-
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToSMEM) {
+      // FIXME: Size may be > 4 but extra bytes wasted.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
@@ -691,7 +746,7 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                   EltSize, MinAlign(Align, EltSize * i));
 
       // Add i * 4 offset
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
       if (Offset != 0) {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg())
@@ -702,14 +757,14 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
       }
 
       auto MIB =
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
+        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
         .addReg(MFI->getScratchRSrcReg()) // sbase
         .addReg(OffsetReg, RegState::Kill) // soff
         .addImm(0) // glc
         .addMemOperand(MMO);
 
       if (NumSubRegs > 1)
-        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+        MIB.addReg(SuperReg, RegState::ImplicitDefine);
 
       continue;
     }
@@ -725,7 +780,7 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
       .addImm(Spill.Lane);
 
     if (NumSubRegs > 1)
-      MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+      MIB.addReg(SuperReg, RegState::ImplicitDefine);
   } else {
     // Restore SGPR from a stack slot.
     // FIXME: We should use S_LOAD_DWORD here for VI.
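
To make the width selection above concrete: getSpillEltSize picks the widest SMEM opcode whose size evenly divides the register class, and the spill loop then emits Size / EltSize operations. Below is a standalone mirror of that logic with the opcodes reduced to strings — a sketch for illustration only, assuming register-class sizes are reported in bytes (4 per SGPR):

#include <cstdio>
#include <string>
#include <utility>

// Illustrative stand-in for getSpillEltSize; opcode names are strings here.
static std::pair<unsigned, std::string> spillEltSize(unsigned SuperRegSize,
                                                     bool Store) {
  if (SuperRegSize % 16 == 0)
    return {16, Store ? "S_BUFFER_STORE_DWORDX4_SGPR"
                      : "S_BUFFER_LOAD_DWORDX4_SGPR"};
  if (SuperRegSize % 8 == 0)
    return {8, Store ? "S_BUFFER_STORE_DWORDX2_SGPR"
                     : "S_BUFFER_LOAD_DWORDX2_SGPR"};
  return {4, Store ? "S_BUFFER_STORE_DWORD_SGPR" : "S_BUFFER_LOAD_DWORD_SGPR"};
}

int main() {
  // SGPR register-class sizes in bytes: s32, s64, s128, s256, s512.
  for (unsigned Size : {4u, 8u, 16u, 32u, 64u}) {
    auto [EltSize, Op] = spillEltSize(Size, /*Store=*/true);
    // One spill instruction per EltSize-byte piece of the super-register.
    std::printf("%2u-byte class -> %s x%u\n", Size, Op.c_str(),
                Size / EltSize);
  }
  return 0;
}

Even the widest 64-byte class now takes four dwordx4 stores into consecutive 16-byte slots, instead of sixteen dword stores each a wave-stride apart.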

llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll

Lines changed: 3 additions & 19 deletions
@@ -13,31 +13,15 @@
 ; SGPR-NEXT: s_nop 4
 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
 
-
 ; Make sure scratch wave offset register is correctly incremented and
 ; then restored.
 ; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[92:95], m0 ; 16-byte Folded Spill
 
 ; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[92:95], m0 ; 16-byte Folded Reload
 
+; SMEM: s_dcache_wb
 ; ALL: s_endpgm
 define void @test(i32 addrspace(1)* %out, i32 %in) {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()

llvm/test/CodeGen/AMDGPU/spill-m0.ll

Lines changed: 10 additions & 19 deletions
@@ -72,9 +72,7 @@ endif:
 
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s7, 0x100
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM: s_add_u32 m0, s7, 0x200
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
 ; TOSMEM-NOT: m0
 
 ; TOSMEM: s_mov_b64 exec,
@@ -83,7 +81,7 @@ endif:
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
 ; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100
-; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
+; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 
 ; GCN-NOT: v_readlane_b32 m0
@@ -124,9 +122,7 @@ endif: ; preds = %else, %if
 
 ; TOSMEM: s_mov_b32 vcc_hi, m0
 ; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
 ; TOSMEM: s_mov_b32 m0, vcc_hi
 
 ; TOSMEM: s_mov_b64 exec,
@@ -135,13 +131,7 @@ endif: ; preds = %else, %if
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
 ; TOSMEM-NEXT: s_mov_b32 m0, s3
-; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
-; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
-
-; FIXME: Could delay this wait
-; TOSMEM-NEXT: s_waitcnt lgkmcnt(0)
-; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
-
+; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 ; GCN-NOT: v_readlane_b32 m0
 ; GCN-NOT: s_buffer_store_dword m0
@@ -167,21 +157,22 @@ endif:
 }
 
 ; GCN-LABEL: {{^}}restore_m0_lds:
+; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
 ; TOSMEM: s_cmp_eq_u32
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_store_dword s4, s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[84:87], m0 ; 8-byte Folded Spill
+; TOSMEM-NOT: m0
+; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[84:87], m0 ; 4-byte Folded Spill
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1
 
 ; TOSMEM: s_mov_b32 m0, -1
 
 ; TOSMEM: s_mov_b32 vcc_hi, m0
 ; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_load_dword s4, s[84:87], m0 ; 4-byte Folded Reload
-; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_waitcnt lgkmcnt(0)
-; TOSMEM: s_buffer_load_dword s5, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[84:87], m0 ; 8-byte Folded Reload
 ; TOSMEM: s_mov_b32 m0, vcc_hi
 ; TOSMEM: s_waitcnt lgkmcnt(0)
 
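The restore_m0_lds checks above show the two layouts coexisting: an 8-byte dwordx2 spill at the base and a separate 4-byte dword spill at 0x200. Under the new formula that constant falls out directly, assuming a 64-lane wave and the second stack object sitting at frame offset 8, right after the first 8-byte slot — both illustrative assumptions, not stated by the commit:

#include <cassert>

int main() {
  const long WavefrontSize = 64; // lanes per wave (assumed)
  const long FrOffset = 8;       // assumed frame offset of the second slot
  const long EltSize = 4;        // the second spill is a single dword

  // First (and only) component of the second slot, new scheme:
  long Offset = (WavefrontSize * FrOffset) + (EltSize * 0);
  assert(Offset == 0x200); // matches "s_add_u32 m0, s3, 0x200" in the checks
  return 0;
}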