
Commit d0cdfd4

Revert: Refactor Pre-RA Scheduling
Refactor Pre-RA Scheduling
1 parent 3c5cdae commit d0cdfd4

6 files changed: +110 -122 lines changed


visa/G4_Kernel.hpp

Lines changed: 1 addition & 1 deletion
@@ -426,6 +426,7 @@ class G4_Kernel {
   bool m_hasIndirectCall = false;
 
   VarSplitPass *varSplitPass = nullptr;
+  GRFMode grfMode;
 
   // map key is filename string with complete path.
   // if first elem of pair is false, the file wasn't found.
@@ -493,7 +494,6 @@ class G4_Kernel {
   unsigned char minor_version;
 
   StackCallABI stackCall;
-  GRFMode grfMode;
 
   G4_Kernel(const PlatformInfo &pInfo, INST_LIST_NODE_ALLOCATOR &alloc,
             Mem_Manager &m, Options *options, Attributes *anAttr,

visa/LocalScheduler/G4_Sched.cpp

Lines changed: 89 additions & 91 deletions
@@ -31,7 +31,6 @@ static const unsigned LARGE_BLOCK_SIZE = 20000;
 static const unsigned LARGE_BLOCK_SIZE_RPE = 32000;
 static const unsigned PRESSURE_REDUCTION_MIN_BENEFIT = 5; // percentage
 static const unsigned PRESSURE_REDUCTION_THRESHOLD = 110;
-static const unsigned PRESSURE_LATENCY_HIDING_THRESHOLD = 104;
 static const unsigned PRESSURE_HIGH_THRESHOLD = 128;
 static const unsigned PRESSURE_REDUCTION_THRESHOLD_SIMD32 = 120;
 
@@ -362,16 +361,12 @@ struct RegisterPressure {
   }
 
   void recompute(G4_BB *BB) { rpe->runBB(BB); }
-  void recompute() { rpe->run(); }
 
   // Return the register pressure in GRF for an instruction.
   unsigned getPressure(G4_INST *Inst) const {
     return rpe->getRegisterPressure(Inst);
   }
 
-  // Return the max register pressure
-  unsigned getMaxRP() const { return rpe->getMaxRP(); }
-
   // Return the max pressure in GRFs for this block.
   unsigned getPressure(G4_BB *bb, std::vector<G4_INST *> *Insts = nullptr) {
     unsigned Max = 0;
@@ -495,7 +490,7 @@ class BB_Scheduler {
   // ReassignID of PreNodes when this is not 1st-round scheduling
   // KernelRP is the measure max reg-pressure of this kernel before scheduling
   bool scheduleBlockForLatency(unsigned &MaxPressure, bool ReassignID,
-                               unsigned UpperBoundGRF);
+                               unsigned KernelRP);
 
 private:
   void SethiUllmanScheduling();
@@ -550,29 +545,29 @@ static unsigned getLatencyHidingThreshold(G4_Kernel &kernel, unsigned NumGrfs) {
   unsigned RPThreshold =
       kernel.getOptions()->getuInt32Option(vISA_preRA_ScheduleRPThreshold);
   if (RPThreshold == 0) {
-    RPThreshold = PRESSURE_LATENCY_HIDING_THRESHOLD;
+    RPThreshold = 104;
   }
-  return unsigned(RPThreshold * (std::max(NumGrfs, 128u) - 48u) / 80u);
+  return unsigned(RPThreshold * (std::max(NumGrfs, 128u) - 32u) / 96u);
 }
 
-preRA_Scheduler::preRA_Scheduler(G4_Kernel &k)
-    : kernel(k) {}
+preRA_Scheduler::preRA_Scheduler(G4_Kernel &k, RPE *rpe)
+    : kernel(k), rpe(rpe), m_options(kernel.getOptions()) {}
 
 preRA_Scheduler::~preRA_Scheduler() {}
 
-bool preRA_Scheduler::run(unsigned &KernelPressure) {
+bool preRA_Scheduler::run() {
   if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D) {
     // Do not run pre-RA scheduler for CM unless user forces it.
-    if (!kernel.getOption(vISA_preRA_ScheduleForce))
+    if (!m_options->getOption(vISA_preRA_ScheduleForce))
       return false;
   }
 
   unsigned Threshold = getRPReductionThreshold(kernel);
-  unsigned SchedCtrl = kernel.getuInt32Option(vISA_preRA_ScheduleCtrl);
+  unsigned SchedCtrl = m_options->getuInt32Option(vISA_preRA_ScheduleCtrl);
 
   auto LT = LatencyTable::createLatencyTable(*kernel.fg.builder);
   SchedConfig config(SchedCtrl);
-  RegisterPressure rp(kernel, nullptr);
+  RegisterPressure rp(kernel, rpe);
   // skip extreme test cases that scheduling does not good
   // if (kernel.fg.getNumBB() >= 10000 && rp.rpe->getMaxRP() >= 800)
   //   return false;
@@ -585,16 +580,16 @@ bool preRA_Scheduler::run(unsigned &KernelPressure) {
       continue;
     }
 
-    if (kernel.getuInt32Option(vISA_ScheduleStartBBID) &&
+    if (kernel.getOptions()->getuInt32Option(vISA_ScheduleStartBBID) &&
         (bb->getId() <
-         kernel.getuInt32Option(vISA_ScheduleStartBBID))) {
+         kernel.getOptions()->getuInt32Option(vISA_ScheduleStartBBID))) {
       SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
       continue;
     }
 
-    if (kernel.getuInt32Option(vISA_ScheduleEndBBID) &&
+    if (kernel.getOptions()->getuInt32Option(vISA_ScheduleEndBBID) &&
         (bb->getId() >
-         kernel.getuInt32Option(vISA_ScheduleEndBBID))) {
+         kernel.getOptions()->getuInt32Option(vISA_ScheduleEndBBID))) {
       SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
       continue;
     }
@@ -612,118 +607,120 @@ bool preRA_Scheduler::run(unsigned &KernelPressure) {
     Changed |= S.scheduleBlockForPressure(MaxPressure, Threshold);
     Changed |= S.scheduleBlockForLatency(MaxPressure, Changed, 0);
   }
-
-  if (Changed)
-    rp.recompute();
-  KernelPressure = rp.getMaxRP();
-
+  if (kernel.getOptions()->getOption(vISA_PreSchedGRFPressure)) {
+    rp.rpe->run();
+    kernel.fg.builder->getJitInfo()->stats.maxGRFPressure = rp.rpe->getMaxRP();
+  }
   return Changed;
 }
 
-preRA_RegSharing::preRA_RegSharing(G4_Kernel &k)
-    : kernel(k) {}
+preRA_RegSharing::preRA_RegSharing(G4_Kernel &k, RPE *rpe)
+    : kernel(k), rpe(rpe) {}
 
 preRA_RegSharing::~preRA_RegSharing() {}
 
-bool preRA_RegSharing::run(unsigned &KernelPressure) {
-
+bool preRA_RegSharing::run() {
+  // General algorithm:
+  //  1. Schedule for pressure
+  //     - If RP is low (e.g. < 64, based on platform), set maximum number of
+  //     threads
+  //  2. Estimate number of threads [4 .. 12] based on initial RP
+  //  3. Schedule for latency (obtain ILP, stalls, throughput)
+  //  4. Compute cost of schedule
+  //  5. Based on schedule cost:
+  //     - Return ok (keep best schedule)
+  //     - Goto 3
 
   if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D) {
     // Do not run pre-RA scheduler for CM unless user forces it.
-    if (!kernel.getOption(vISA_preRA_ScheduleForce))
+    if (!kernel.getOptions()->getOption(vISA_preRA_ScheduleForce))
       return false;
   }
 
-  bool Changed = false;
+  bool changed = false;
 
-  unsigned SchedCtrl = kernel.getuInt32Option(vISA_preRA_ScheduleCtrl);
+  unsigned SchedCtrl =
+      kernel.getOptions()->getuInt32Option(vISA_preRA_ScheduleCtrl);
   SchedConfig config(SchedCtrl);
-  RegisterPressure rp(kernel, nullptr);
-  KernelPressure = rp.getMaxRP();
-  unsigned RPReductionThreshold = getRPReductionThreshold(kernel);
-  auto LT = LatencyTable::createLatencyTable(*kernel.fg.builder);
 
-  // Schedule for reg pressure reduction if needed
+  RegisterPressure rp(kernel, rpe);
+
+  std::unordered_map<G4_BB *, unsigned int> rpBB;
+  unsigned KernelPressure = 0;
+
+  // Obtain register pressure estimate of every BB
   for (auto bb : kernel.fg) {
-    // Skip BBs:
-    if (bb->size() < SMALL_BLOCK_SIZE || bb->size() > LARGE_BLOCK_SIZE) {
+    if (bb->size() < SMALL_BLOCK_SIZE || bb->size() > LARGE_BLOCK_SIZE_RPE) {
       SCHED_DUMP(std::cerr << "Skip block with instructions " << bb->size()
                            << "\n");
       continue;
     }
 
-    if (kernel.getuInt32Option(vISA_ScheduleStartBBID) &&
-        (bb->getId() < kernel.getuInt32Option(vISA_ScheduleStartBBID))) {
-      SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
-      continue;
-    }
+    unsigned pressure = rp.getPressure(bb);
+    rpBB[bb] = pressure;
 
-    if (kernel.getuInt32Option(vISA_ScheduleEndBBID) &&
-        (bb->getId() > kernel.getuInt32Option(vISA_ScheduleEndBBID))) {
-      SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
-      continue;
+    if (pressure > KernelPressure) {
+      KernelPressure = pressure;
     }
-
-    // Schedule:
-    SCHED_DUMP(rp.dump(bb, "Before scheduling for pressure reduction, "));
-    preDDD ddd(kernel, bb);
-    BB_Scheduler S(kernel, ddd, rp, config, *LT);
-    unsigned BBRP = rp.getPressure(bb);
-    Changed |= S.scheduleBlockForPressure(BBRP, RPReductionThreshold);
   }
 
-  if (Changed) {
-    // Re-compute register pressure estimation
-    rp.recompute();
-    KernelPressure = rp.getMaxRP();
+  // Obs: Heuristic considering PVC with 2 GRF modes as of 03/2020
+  // If maximum register pressure is higher than default GRF mode,
+  // assign the smallest number of threads to this kernel.
+  if (!kernel.getOptions()->getuInt32Option(vISA_HWThreadNumberPerEU) &&
+      (KernelPressure >
+       kernel.getScaledGRFSize(PRESSURE_HIGH_THRESHOLD) -
+           kernel.getOptions()->getuInt32Option(vISA_ReservedGRFNum))) {
+    // Update number of threads, GRF, Acc and SWSB
+    kernel.updateKernelToLargerGRF();
   }
 
-  // Adjust GRF based on register pressure
-  unsigned oldGRFNum = kernel.getNumRegTotal();
-  kernel.updateKernelByRegPressure(KernelPressure);
-  bool GRFdecreased = kernel.getNumRegTotal() < oldGRFNum;
-  Changed = false;
+  // skip extreme test cases that scheduling does not good
+  // if (kernel.fg.getNumBB() >= 10000 && KernelPressure >= 800)
+  //   return false;
+
+  unsigned Threshold = getRPReductionThreshold(kernel);
+  auto LT = LatencyTable::createLatencyTable(*kernel.fg.builder);
 
-  // Schedule for latency hiding if needed
   for (auto bb : kernel.fg) {
-    // Skip BBs:
     if (bb->size() < SMALL_BLOCK_SIZE || bb->size() > LARGE_BLOCK_SIZE) {
       SCHED_DUMP(std::cerr << "Skip block with instructions " << bb->size()
                            << "\n");
       continue;
     }
 
-    if (kernel.getuInt32Option(vISA_ScheduleStartBBID) &&
-        (bb->getId() < kernel.getuInt32Option(vISA_ScheduleStartBBID))) {
+    if (kernel.getOptions()->getuInt32Option(vISA_ScheduleStartBBID) &&
+        (bb->getId() <
+         kernel.getOptions()->getuInt32Option(vISA_ScheduleStartBBID))) {
       SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
       continue;
     }
 
-    if (kernel.getuInt32Option(vISA_ScheduleEndBBID) &&
-        (bb->getId() > kernel.getuInt32Option(vISA_ScheduleEndBBID))) {
+    if (kernel.getOptions()->getuInt32Option(vISA_ScheduleEndBBID) &&
+        (bb->getId() >
+         kernel.getOptions()->getuInt32Option(vISA_ScheduleEndBBID))) {
       SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
       continue;
     }
 
-    // Schedule:
-    SCHED_DUMP(rp.dump(bb, "Before scheduling for latency hiding, "));
+    unsigned MaxPressure = rpBB.find(bb) == rpBB.end() ? 0 : rpBB[bb];
+    if (MaxPressure <= Threshold && !config.UseLatency) {
+      SCHED_DUMP(std::cerr << "Skip block with rp " << MaxPressure << "\n");
+      continue;
+    }
+
+    SCHED_DUMP(rp.dump(bb, "Before scheduling, "));
     preDDD ddd(kernel, bb);
     BB_Scheduler S(kernel, ddd, rp, config, *LT);
-    unsigned BBRP = rp.getPressure(bb);
 
-    unsigned UpperBoundGRF = 0;
-    if (GRFdecreased && KernelPressure < kernel.grfMode.getMaxGRF())
-      UpperBoundGRF = kernel.grfMode.getLargerGRF();
-    Changed |= S.scheduleBlockForLatency(BBRP, Changed, UpperBoundGRF);
+    changed |= S.scheduleBlockForPressure(MaxPressure, Threshold);
+    changed |= S.scheduleBlockForLatency(MaxPressure, changed, 0);
   }
-
-  if (Changed) {
-    rp.recompute();
-    KernelPressure = rp.getMaxRP();
+  if (kernel.getOptions()->getOption(vISA_PreSchedGRFPressure)) {
+    rp.rpe->run();
+    kernel.fg.builder->getJitInfo()->stats.maxGRFPressure = rp.rpe->getMaxRP();
   }
-  kernel.updateKernelByRegPressure(KernelPressure);
-
-  return Changed;
+  return changed;
 }
 
 bool BB_Scheduler::verifyScheduling() {
@@ -1434,14 +1431,13 @@ class LatencyQueue : public QueueBase {
 
 //
 bool BB_Scheduler::scheduleBlockForLatency(unsigned &MaxPressure,
-                                           bool ReassignID, unsigned UpperBoundGRF) {
+                                           bool ReassignID, unsigned KernelRP) {
   auto tryLatencyHiding = [=](unsigned nr) {
     if (!config.UseLatency)
       return false;
 
-    // UpperBoundGRF == 0 means we are scheduling for the fixed number of GRF
-    if (UpperBoundGRF == 0 &&
-        MaxPressure >= getLatencyHidingThreshold(kernel, nr))
+    // KernelRP == 0 means we are scheduling for the fixed number of GRF
+    if (KernelRP == 0 && MaxPressure >= getLatencyHidingThreshold(kernel, nr))
       return false;
 
     // simple ROI check.
@@ -1464,15 +1460,17 @@ bool BB_Scheduler::scheduleBlockForLatency(unsigned &MaxPressure,
   if (!tryLatencyHiding(NumGrfs))
     return false;
 
-  // UpperBoundGRF == 0 means we only schedule under single NumGRF
+  // UpperBoundGRF == NumGrfs means we only schedule under single NumGRF
   // setting for this block instead of trying to find the best schedule
-  // among multiple NumGRF settings.
-  if (UpperBoundGRF == 0)
-    UpperBoundGRF = NumGrfs;
-
+  // among multiple NumGRF setting.
+  unsigned UpperBoundGRF = NumGrfs;
   unsigned SavedEstimation = 0;
   std::vector<G4_INST *> SavedSchedule;
 
+  // multiple settings are applied only to some blocks to save time
+  if (KernelRP > 0 && MaxPressure > 40 && MaxPressure * 2 > KernelRP)
+    UpperBoundGRF = std::max(256U, UpperBoundGRF);
+
   for (; NumGrfs <= UpperBoundGRF; NumGrfs += 32) {
     // try grouping-threshold decremently until we find a schedule likely won't
     // spill

visa/LocalScheduler/LocalScheduler_G4IR.h

Lines changed: 7 additions & 4 deletions
@@ -343,12 +343,14 @@ class LocalScheduler {
 
 class preRA_Scheduler {
 public:
-  preRA_Scheduler(G4_Kernel &k);
+  preRA_Scheduler(G4_Kernel &k, RPE *rpe);
   ~preRA_Scheduler();
-  bool run(unsigned &KernelPressure);
+  bool run();
 
 private:
   G4_Kernel &kernel;
+  RPE *rpe;
+  Options *m_options;
 };
 
 class preRA_ACC_Scheduler {
@@ -365,12 +367,13 @@ class preRA_ACC_Scheduler {
 
 class preRA_RegSharing {
 public:
-  preRA_RegSharing(G4_Kernel &k);
+  preRA_RegSharing(G4_Kernel &k, RPE *rpe);
   ~preRA_RegSharing();
-  bool run(unsigned &KernelPressure);
+  bool run();
 
 private:
   G4_Kernel &kernel;
+  RPE *rpe;
 };
 // Restrictions of candidate for 2xDP:
 // 1, Only support SIMD16 DF mad with M0

visa/Optimizer.cpp

Lines changed: 0 additions & 22 deletions
@@ -137,28 +137,6 @@ void Optimizer::insertFallThroughJump() {
   }
 }
 
-void Optimizer::preRA_Schedule() {
-  bool Changed = false;
-  unsigned KernelPressure = 0;
-  if (kernel.useRegSharingHeuristics()) {
-    preRA_RegSharing Sched(kernel);
-    Changed = Sched.run(KernelPressure);
-  } else {
-    preRA_Scheduler Sched(kernel);
-    Changed = Sched.run(KernelPressure);
-  }
-  // Update Jit info for max register pressure
-  kernel.fg.builder->getJitInfo()->stats.maxGRFPressure = KernelPressure;
-
-  unsigned GRFChange = (KernelPressure * 100) / kernel.getNumRegTotal();
-  if (kernel.getOption(vISA_AbortOnSpill) &&
-      GRFChange > ABORT_ON_SPILL_IF_RP_HIGH) {
-    // If -abortOnSpill is set and register spills are anavoidable,
-    // compilation is aborted.
-    AbortHighRP = true;
-  }
-}
-
 void Optimizer::forceAssignRegs() {
   const char *rawStr =
       builder.getOptions()->getOptionCstr(vISA_ForceAssignRhysicalReg);
