Skip to content

Commit e95f6fa

Browse files
authored
RegisterCoalescer: Enable terminal rule by default for AMDGPU (#161621)
Introduce a target hook to incrementally flip the behavior of targets with test changes, and start by implementing it for AMDGPU. This appears to be a forgotten switch flip from 2015. This seems to do a nicer job with subregister copies. Most of the test changes are improvements or neutral; only a few are light regressions. The worst AMDGPU regressions are for true16 in the atomic tests, but I think that's due to existing true16 issues.
1 parent 6bad2d1 commit e95f6fa

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+5735
-5730
lines changed

llvm/include/llvm/CodeGen/TargetSubtargetInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,10 @@ class LLVM_ABI TargetSubtargetInfo : public MCSubtargetInfo {
210210
/// can be overridden.
211211
virtual bool enableJoinGlobalCopies() const;
212212

213+
/// Temporary hook to bring up the register coalescer's terminal rule one
214+
/// target at a time. This should be unconditionally true; once all targets
/// enable it, delete this hook.
215+
virtual bool enableTerminalRule() const { return false; }
216+
213217
/// True if the subtarget should run a scheduler after register allocation.
214218
///
215219
/// By default this queries the PostRAScheduling bit in the scheduling model

llvm/lib/CodeGen/RegisterCoalescer.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,9 @@ static cl::opt<bool> EnableJoining("join-liveintervals",
7979
cl::desc("Coalesce copies (default=true)"),
8080
cl::init(true), cl::Hidden);
8181

82-
static cl::opt<bool> UseTerminalRule("terminal-rule",
83-
cl::desc("Apply the terminal rule"),
84-
cl::init(false), cl::Hidden);
82+
static cl::opt<cl::boolOrDefault>
83+
EnableTerminalRule("terminal-rule", cl::desc("Apply the terminal rule"),
84+
cl::init(cl::BOU_UNSET), cl::Hidden);
8585

8686
/// Temporary flag to test critical edge unsplitting.
8787
static cl::opt<bool> EnableJoinSplits(
@@ -134,6 +134,7 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate {
134134
SlotIndexes *SI = nullptr;
135135
const MachineLoopInfo *Loops = nullptr;
136136
RegisterClassInfo RegClassInfo;
137+
bool UseTerminalRule = false;
137138

138139
/// Position and VReg of a PHI instruction during coalescing.
139140
struct PHIValPos {
@@ -4320,6 +4321,11 @@ bool RegisterCoalescer::run(MachineFunction &fn) {
43204321
else
43214322
JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
43224323

4324+
if (EnableTerminalRule == cl::BOU_UNSET)
4325+
UseTerminalRule = STI.enableTerminalRule();
4326+
else
4327+
UseTerminalRule = EnableTerminalRule == cl::BOU_TRUE;
4328+
43234329
// If there are PHIs tracked by debug-info, they will need updating during
43244330
// coalescing. Build an index of those PHIs to ease updating.
43254331
SlotIndexes *Slots = LIS->getSlotIndexes();

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,6 +1040,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
10401040
return true;
10411041
}
10421042

1043+
bool enableTerminalRule() const override { return true; }
1044+
10431045
bool useAA() const override;
10441046

10451047
bool enableSubRegLiveness() const override {

llvm/lib/Target/AMDGPU/R600Subtarget.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ class R600Subtarget final : public R600GenSubtargetInfo,
126126
return true;
127127
}
128128

129+
bool enableTerminalRule() const override { return true; }
130+
129131
bool enableSubRegLiveness() const override {
130132
return true;
131133
}

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,14 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
2121
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2222
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
2323
; GFX10-NEXT: s_mov_b32 s8, exec_lo
24+
; GFX10-NEXT: s_mov_b32 s9, s5
2425
; GFX10-NEXT: s_add_i32 s6, s6, 1
25-
; GFX10-NEXT: s_xor_b32 s8, s5, s8
26+
; GFX10-NEXT: s_xor_b32 s5, s5, s8
2627
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0
2728
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
2829
; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
29-
; GFX10-NEXT: s_and_b32 s9, exec_lo, s5
30-
; GFX10-NEXT: s_mov_b32 s5, s8
31-
; GFX10-NEXT: s_or_b32 s7, s7, s9
30+
; GFX10-NEXT: s_and_b32 s8, exec_lo, s9
31+
; GFX10-NEXT: s_or_b32 s7, s7, s8
3232
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
3333
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
3434
; GFX10-NEXT: ; %bb.2: ; %exit
@@ -240,11 +240,11 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
240240
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
241241
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
242242
; GFX10-NEXT: s_mov_b32 s6, exec_lo
243-
; GFX10-NEXT: s_mov_b32 s8, 0
243+
; GFX10-NEXT: s_mov_b32 s4, 0
244244
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
245245
; GFX10-NEXT: s_cbranch_execz .LBB4_6
246246
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
247-
; GFX10-NEXT: s_mov_b32 s4, 0
247+
; GFX10-NEXT: s_mov_b32 s8, 0
248248
; GFX10-NEXT: ; implicit-def: $sgpr10
249249
; GFX10-NEXT: ; implicit-def: $sgpr11
250250
; GFX10-NEXT: ; implicit-def: $sgpr9
@@ -345,8 +345,8 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
345345
; GFX10-LABEL: divergent_i1_icmp_used_outside_loop:
346346
; GFX10: ; %bb.0: ; %entry
347347
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348-
; GFX10-NEXT: s_mov_b32 s6, 0
349348
; GFX10-NEXT: s_mov_b32 s4, 0
349+
; GFX10-NEXT: s_mov_b32 s6, 0
350350
; GFX10-NEXT: ; implicit-def: $sgpr7
351351
; GFX10-NEXT: s_branch .LBB5_2
352352
; GFX10-NEXT: .LBB5_1: ; %Flow
@@ -457,8 +457,8 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
457457
; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
458458
; GFX10: ; %bb.0: ; %entry
459459
; GFX10-NEXT: s_mov_b32 s1, exec_lo
460-
; GFX10-NEXT: s_mov_b32 s2, 0
461460
; GFX10-NEXT: s_mov_b32 s0, 0
461+
; GFX10-NEXT: s_mov_b32 s2, 0
462462
; GFX10-NEXT: ; implicit-def: $sgpr4
463463
; GFX10-NEXT: ; implicit-def: $sgpr3
464464
; GFX10-NEXT: s_branch .LBB6_2
@@ -534,8 +534,8 @@ exit:
534534
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
535535
; GFX10-LABEL: loop_with_1break:
536536
; GFX10: ; %bb.0: ; %entry
537-
; GFX10-NEXT: s_mov_b32 s4, 0
538537
; GFX10-NEXT: s_mov_b32 s0, 0
538+
; GFX10-NEXT: s_mov_b32 s4, 0
539539
; GFX10-NEXT: ; implicit-def: $sgpr6
540540
; GFX10-NEXT: ; implicit-def: $sgpr7
541541
; GFX10-NEXT: ; implicit-def: $sgpr5

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,8 @@ exit:
106106
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
107107
; GFX10-LABEL: loop_with_1break:
108108
; GFX10: ; %bb.0: ; %entry
109-
; GFX10-NEXT: s_mov_b32 s4, 0
110109
; GFX10-NEXT: s_mov_b32 s0, 0
110+
; GFX10-NEXT: s_mov_b32 s4, 0
111111
; GFX10-NEXT: ; implicit-def: $sgpr5
112112
; GFX10-NEXT: s_branch .LBB2_2
113113
; GFX10-NEXT: .LBB2_1: ; %Flow
@@ -180,8 +180,8 @@ exit:
180180
define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
181181
; GFX10-LABEL: loop_with_2breaks:
182182
; GFX10: ; %bb.0: ; %entry
183-
; GFX10-NEXT: s_mov_b32 s4, 0
184183
; GFX10-NEXT: s_mov_b32 s0, 0
184+
; GFX10-NEXT: s_mov_b32 s4, 0
185185
; GFX10-NEXT: ; implicit-def: $sgpr5
186186
; GFX10-NEXT: s_branch .LBB3_3
187187
; GFX10-NEXT: .LBB3_1: ; %Flow3
@@ -278,8 +278,8 @@ exit:
278278
define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
279279
; GFX10-LABEL: loop_with_3breaks:
280280
; GFX10: ; %bb.0: ; %entry
281-
; GFX10-NEXT: s_mov_b32 s4, 0
282281
; GFX10-NEXT: s_mov_b32 s0, 0
282+
; GFX10-NEXT: s_mov_b32 s4, 0
283283
; GFX10-NEXT: ; implicit-def: $sgpr5
284284
; GFX10-NEXT: s_branch .LBB4_4
285285
; GFX10-NEXT: .LBB4_1: ; %Flow5
@@ -404,8 +404,8 @@ exit:
404404
define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
405405
; GFX10-LABEL: loop_with_div_break_with_body:
406406
; GFX10: ; %bb.0: ; %entry
407-
; GFX10-NEXT: s_mov_b32 s4, 0
408407
; GFX10-NEXT: s_mov_b32 s0, 0
408+
; GFX10-NEXT: s_mov_b32 s4, 0
409409
; GFX10-NEXT: ; implicit-def: $sgpr6
410410
; GFX10-NEXT: ; implicit-def: $sgpr7
411411
; GFX10-NEXT: ; implicit-def: $sgpr5

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
101101
; GFX10-LABEL: loop_with_1break:
102102
; GFX10: ; %bb.0: ; %entry
103103
; GFX10-NEXT: v_mov_b32_e32 v3, 0
104-
; GFX10-NEXT: s_mov_b32 s8, 0
105104
; GFX10-NEXT: s_mov_b32 s4, 0
105+
; GFX10-NEXT: s_mov_b32 s8, 0
106106
; GFX10-NEXT: ; implicit-def: $sgpr10
107107
; GFX10-NEXT: ; implicit-def: $sgpr9
108108
; GFX10-NEXT: s_branch .LBB2_3
@@ -197,14 +197,14 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n.
197197
; GFX10-LABEL: nested_loops_temporal_divergence_inner:
198198
; GFX10: ; %bb.0: ; %entry
199199
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
200-
; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
201-
; GFX10-NEXT: s_mov_b32 s5, 0
200+
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0
202201
; GFX10-NEXT: s_mov_b32 s6, 0
202+
; GFX10-NEXT: s_mov_b32 s8, 0
203203
; GFX10-NEXT: .LBB3_1: ; %OuterHeader
204204
; GFX10-NEXT: ; =>This Loop Header: Depth=1
205205
; GFX10-NEXT: ; Child Loop BB3_2 Depth 2
206206
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
207-
; GFX10-NEXT: s_mov_b32 s4, s8
207+
; GFX10-NEXT: s_mov_b32 s4, s5
208208
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
209209
; GFX10-NEXT: ; implicit-def: $sgpr9
210210
; GFX10-NEXT: v_mov_b32_e32 v6, s10
@@ -239,13 +239,13 @@ define void @nested_loops_temporal_divergence_inner(float %pre.cond.val, i32 %n.
239239
; GFX10-NEXT: s_add_i32 s6, s6, 1
240240
; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6
241241
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4
242-
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
242+
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
243243
; GFX10-NEXT: flat_store_byte v[6:7], v0
244244
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
245-
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
245+
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
246246
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
247247
; GFX10-NEXT: ; %bb.4: ; %exit
248-
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
248+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
249249
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
250250
; GFX10-NEXT: s_setpc_b64 s[30:31]
251251
entry:
@@ -288,14 +288,14 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n.
288288
; GFX10-LABEL: nested_loops_temporal_divergence_outer:
289289
; GFX10: ; %bb.0: ; %entry
290290
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
291-
; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
292-
; GFX10-NEXT: s_mov_b32 s5, 0
291+
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0
293292
; GFX10-NEXT: s_mov_b32 s6, 0
293+
; GFX10-NEXT: s_mov_b32 s8, 0
294294
; GFX10-NEXT: .LBB4_1: ; %OuterHeader
295295
; GFX10-NEXT: ; =>This Loop Header: Depth=1
296296
; GFX10-NEXT: ; Child Loop BB4_2 Depth 2
297297
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
298-
; GFX10-NEXT: s_mov_b32 s4, s8
298+
; GFX10-NEXT: s_mov_b32 s4, s5
299299
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
300300
; GFX10-NEXT: ; implicit-def: $sgpr9
301301
; GFX10-NEXT: v_mov_b32_e32 v6, s10
@@ -330,13 +330,13 @@ define void @nested_loops_temporal_divergence_outer(float %pre.cond.val, i32 %n.
330330
; GFX10-NEXT: s_add_i32 s6, s6, 1
331331
; GFX10-NEXT: v_add_co_u32 v6, s4, v4, v6
332332
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v5, v7, s4
333-
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
333+
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
334334
; GFX10-NEXT: flat_store_byte v[6:7], v0
335335
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
336-
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
336+
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
337337
; GFX10-NEXT: s_cbranch_execnz .LBB4_1
338338
; GFX10-NEXT: ; %bb.4: ; %exit
339-
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
339+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
340340
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
341341
; GFX10-NEXT: s_setpc_b64 s[30:31]
342342
entry:
@@ -379,15 +379,15 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
379379
; GFX10-LABEL: nested_loops_temporal_divergence_both:
380380
; GFX10: ; %bb.0: ; %entry
381381
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
382-
; GFX10-NEXT: v_cmp_lt_f32_e64 s8, 1.0, v0
383-
; GFX10-NEXT: s_mov_b32 s5, 0
382+
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v0
384383
; GFX10-NEXT: s_mov_b32 s6, 0
384+
; GFX10-NEXT: s_mov_b32 s8, 0
385385
; GFX10-NEXT: ; implicit-def: $sgpr9
386386
; GFX10-NEXT: .LBB5_1: ; %OuterHeader
387387
; GFX10-NEXT: ; =>This Loop Header: Depth=1
388388
; GFX10-NEXT: ; Child Loop BB5_2 Depth 2
389389
; GFX10-NEXT: s_ashr_i32 s7, s6, 31
390-
; GFX10-NEXT: s_mov_b32 s4, s8
390+
; GFX10-NEXT: s_mov_b32 s4, s5
391391
; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], 2
392392
; GFX10-NEXT: v_mov_b32_e32 v8, s10
393393
; GFX10-NEXT: v_mov_b32_e32 v9, s11
@@ -421,13 +421,13 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i
421421
; GFX10-NEXT: s_add_i32 s6, s6, 1
422422
; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v8
423423
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v9, s4
424-
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
424+
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
425425
; GFX10-NEXT: flat_store_byte v[8:9], v0
426426
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
427-
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
427+
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
428428
; GFX10-NEXT: s_cbranch_execnz .LBB5_1
429429
; GFX10-NEXT: ; %bb.4: ; %exit
430-
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
430+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
431431
; GFX10-NEXT: flat_store_byte v[6:7], v0
432432
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
433433
; GFX10-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,8 +547,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
547547
;
548548
; NEW_RBS-LABEL: loop_with_2breaks:
549549
; NEW_RBS: ; %bb.0: ; %entry
550-
; NEW_RBS-NEXT: s_mov_b32 s4, 0
551550
; NEW_RBS-NEXT: s_mov_b32 s0, 0
551+
; NEW_RBS-NEXT: s_mov_b32 s4, 0
552552
; NEW_RBS-NEXT: ; implicit-def: $sgpr5
553553
; NEW_RBS-NEXT: s_branch .LBB16_3
554554
; NEW_RBS-NEXT: .LBB16_1: ; %Flow3

0 commit comments

Comments
 (0)