@@ -31,7 +31,6 @@ static const unsigned LARGE_BLOCK_SIZE = 20000;
// File-scope tuning constants for the pre-RA scheduler (diff view: the line
// marked '-' is removed by this change, unmarked lines are unchanged context).
// NOTE(review): apart from PRESSURE_REDUCTION_MIN_BENEFIT, the units are not
// stated here; presumably GRF counts (PRESSURE_HIGH_THRESHOLD is passed to
// kernel.getScaledGRFSize() in preRA_RegSharing::run) -- confirm.
3131static const unsigned LARGE_BLOCK_SIZE_RPE = 32000 ;
3232static const unsigned PRESSURE_REDUCTION_MIN_BENEFIT = 5 ; // percentage
3333static const unsigned PRESSURE_REDUCTION_THRESHOLD = 110 ;
// Removed constant: the '+' side of getLatencyHidingThreshold() uses the
// literal 104 in its place.
34- static const unsigned PRESSURE_LATENCY_HIDING_THRESHOLD = 104 ;
3534static const unsigned PRESSURE_HIGH_THRESHOLD = 128 ;
3635static const unsigned PRESSURE_REDUCTION_THRESHOLD_SIMD32 = 120 ;
3736
@@ -362,16 +361,12 @@ struct RegisterPressure {
362361 }
363362
364363 void recompute (G4_BB *BB) { rpe->runBB (BB); }
365- void recompute () { rpe->run (); }
366364
367365 // Return the register pressure in GRF for an instruction.
368366 unsigned getPressure (G4_INST *Inst) const {
369367 return rpe->getRegisterPressure (Inst);
370368 }
371369
372- // Return the max register pressure
373- unsigned getMaxRP () const { return rpe->getMaxRP (); }
374-
375370 // Return the max pressure in GRFs for this block.
376371 unsigned getPressure (G4_BB *bb, std::vector<G4_INST *> *Insts = nullptr ) {
377372 unsigned Max = 0 ;
@@ -495,7 +490,7 @@ class BB_Scheduler {
// Declaration of the per-block latency scheduling entry point.
// MaxPressure is taken by non-const reference.
// The '+' line renames the third parameter from UpperBoundGRF to KernelRP,
// which is the name the existing comment below already uses. Per the comment
// in the definition, KernelRP == 0 means scheduling for a fixed number of GRF.
495490 // ReassignID of PreNodes when this is not 1st-round scheduling
496491 // KernelRP is the measure max reg-pressure of this kernel before scheduling
497492 bool scheduleBlockForLatency (unsigned &MaxPressure, bool ReassignID,
498- unsigned UpperBoundGRF );
493+ unsigned KernelRP );
499494
500495private:
501496 void SethiUllmanScheduling ();
@@ -550,29 +545,29 @@ static unsigned getLatencyHidingThreshold(G4_Kernel &kernel, unsigned NumGrfs) {
// Computes the threshold that scheduleBlockForLatency() compares MaxPressure
// against. The base value comes from the vISA_preRA_ScheduleRPThreshold
// option; an option value of 0 selects the built-in default of 104.
550545 unsigned RPThreshold =
551546 kernel.getOptions ()->getuInt32Option (vISA_preRA_ScheduleRPThreshold);
552547 if (RPThreshold == 0 ) {
// Same value on both sides: the '-' side names it, the '+' side inlines it.
553- RPThreshold = PRESSURE_LATENCY_HIDING_THRESHOLD ;
548+ RPThreshold = 104 ;
554549 }
// The base value is scaled by the GRF count, which is clamped to at least 128.
// Both formulas return RPThreshold unchanged at 128 GRFs. Above that they
// differ: with the default 104 and NumGrfs = 256, the '-' side yields 270 and
// the '+' side yields 242 (integer division).
555- return unsigned (RPThreshold * (std::max (NumGrfs, 128u ) - 48u ) / 80u );
550+ return unsigned (RPThreshold * (std::max (NumGrfs, 128u ) - 32u ) / 96u );
556551}
557552
// Constructor. The '-' side stores only the kernel reference. The '+' side
// additionally stores the caller-supplied RPE pointer and caches the kernel's
// options pointer in m_options.
// NOTE(review): m_options is initialized from the member `kernel`, not the
// parameter `k`; this presumably relies on `kernel` being declared before
// m_options in the class (declaration not visible here) -- confirm.
558- preRA_Scheduler::preRA_Scheduler (G4_Kernel &k)
559- : kernel(k) {}
553+ preRA_Scheduler::preRA_Scheduler (G4_Kernel &k, RPE *rpe )
554+ : kernel(k), rpe(rpe), m_options(kernel.getOptions()) {}
560555

// Destructor with an empty body.
561556preRA_Scheduler::~preRA_Scheduler () {}
562557
// preRA_Scheduler::run -- diff view ('-' removed, '+' added, unmarked context).
// Parts of the body are elided between the '@@' hunk headers, so only the
// visible lines are described here.
//   '-' side: reports the kernel's max register pressure through the
//             KernelPressure out-parameter.
//   '+' side: takes no arguments; when vISA_PreSchedGRFPressure is set it
//             stores the max pressure in the JIT stats instead.
// Returns `Changed`, which accumulates the results of the schedule* calls.
563- bool preRA_Scheduler::run (unsigned &KernelPressure ) {
558+ bool preRA_Scheduler::run () {
564559 if (kernel.getInt32KernelAttr (Attributes::ATTR_Target) != VISA_3D) {
565560 // Do not run pre-RA scheduler for CM unless user forces it.
566- if (!kernel. getOption (vISA_preRA_ScheduleForce))
561+ if (!m_options-> getOption (vISA_preRA_ScheduleForce))
567562 return false ;
568563 }
569564
570565 unsigned Threshold = getRPReductionThreshold (kernel);
571- unsigned SchedCtrl = kernel. getuInt32Option (vISA_preRA_ScheduleCtrl);
566+ unsigned SchedCtrl = m_options-> getuInt32Option (vISA_preRA_ScheduleCtrl);
572567
573568 auto LT = LatencyTable::createLatencyTable (*kernel.fg .builder );
574569 SchedConfig config (SchedCtrl);
// The '+' side hands the member RPE to RegisterPressure where the '-' side
// passed nullptr.
575- RegisterPressure rp (kernel, nullptr );
570+ RegisterPressure rp (kernel, rpe );
576571 // skip extreme test cases that scheduling does not good
577572 // if (kernel.fg.getNumBB() >= 10000 && rp.rpe->getMaxRP() >= 800)
578573 // return false;
@@ -585,16 +580,16 @@ bool preRA_Scheduler::run(unsigned &KernelPressure) {
585580 continue ;
586581 }
587582
// Optional basic-block ID window: a value of 0 for either option disables
// that bound.
// NOTE(review): the '+' lines here go through kernel.getOptions() while the
// '+' lines above use the cached m_options -- consider using one form.
588- if (kernel.getuInt32Option (vISA_ScheduleStartBBID) &&
583+ if (kernel.getOptions ()-> getuInt32Option (vISA_ScheduleStartBBID) &&
589584 (bb->getId () <
590- kernel.getuInt32Option (vISA_ScheduleStartBBID))) {
585+ kernel.getOptions ()-> getuInt32Option (vISA_ScheduleStartBBID))) {
591586 SCHED_DUMP (std::cerr << " Skip BB" << bb->getId () << " \n " );
592587 continue ;
593588 }
594589
595- if (kernel.getuInt32Option (vISA_ScheduleEndBBID) &&
590+ if (kernel.getOptions ()-> getuInt32Option (vISA_ScheduleEndBBID) &&
596591 (bb->getId () >
597- kernel.getuInt32Option (vISA_ScheduleEndBBID))) {
592+ kernel.getOptions ()-> getuInt32Option (vISA_ScheduleEndBBID))) {
598593 SCHED_DUMP (std::cerr << " Skip BB" << bb->getId () << " \n " );
599594 continue ;
600595 }
@@ -612,118 +607,120 @@ bool preRA_Scheduler::run(unsigned &KernelPressure) {
// Pressure scheduling runs first, then latency scheduling. The latency call
// receives the accumulated `Changed` flag as its ReassignID argument and 0 as
// its last argument.
612607 Changed |= S.scheduleBlockForPressure (MaxPressure, Threshold);
613608 Changed |= S.scheduleBlockForLatency (MaxPressure, Changed, 0 );
614609 }
// '-' side: recompute pressure only if something changed, then always report
// the max through the out-parameter.
// '+' side: only when vISA_PreSchedGRFPressure is set, rerun the estimator
// (regardless of `Changed`) and record the max in the JIT stats.
615-
616- if (Changed)
617- rp.recompute ();
618- KernelPressure = rp.getMaxRP ();
619-
610+ if (kernel.getOptions ()->getOption (vISA_PreSchedGRFPressure)) {
611+ rp.rpe ->run ();
612+ kernel.fg .builder ->getJitInfo ()->stats .maxGRFPressure = rp.rpe ->getMaxRP ();
613+ }
620614 return Changed;
621615}
622616
// Constructor. The '-' side stores only the kernel reference; the '+' side
// also stores the caller-supplied RPE pointer, which run() passes on to
// RegisterPressure.
623- preRA_RegSharing::preRA_RegSharing (G4_Kernel &k)
624- : kernel(k) {}
617+ preRA_RegSharing::preRA_RegSharing (G4_Kernel &k, RPE *rpe )
618+ : kernel(k), rpe(rpe) {}
625619

// Destructor with an empty body.
626620preRA_RegSharing::~preRA_RegSharing () {}
627621
// preRA_RegSharing::run -- diff view ('-' removed, '+' added, unmarked context).
//
// '+' side, as visible below:
//   * returns false for non-VISA_3D targets unless vISA_preRA_ScheduleForce
//     is set;
//   * a first loop stores rp.getPressure(bb) in rpBB for every block it does
//     not skip by size, and tracks the largest value in KernelPressure;
//   * based on that maximum it may call kernel.updateKernelToLargerGRF();
//   * a second loop runs scheduleBlockForPressure and then
//     scheduleBlockForLatency on each block that passes the filters;
//   * when vISA_PreSchedGRFPressure is set, it reruns the estimator and
//     stores getMaxRP() in the JIT stats;
//   * returns `changed`, the OR of the schedule* results.
//
// '-' side: two separate passes (pressure reduction, then latency hiding)
// with kernel.updateKernelByRegPressure(KernelPressure) called between and
// after them, and the max pressure reported through the out-parameter.
628- bool preRA_RegSharing::run (unsigned &KernelPressure) {
629-
622+ bool preRA_RegSharing::run () {
// NOTE(review): steps 2, 4 and 5 of the outline below have no visible
// counterpart in the '+' body (no thread-count estimate, schedule cost or
// retry loop) -- confirm whether the outline describes planned work.
623+ // General algorithm:
624+ // 1. Schedule for pressure
625+ // - If RP is low (e.g. < 64, based on platform), set maximum number of
626+ // threads
627+ // 2. Estimate number of threads [4 .. 12] based on initial RP
628+ // 3. Schedule for latency (obtain ILP, stalls, throughput)
629+ // 4. Compute cost of schedule
630+ // 5. Based on schedule cost:
631+ // - Return ok (keep best schedule)
632+ // - Goto 3
630633
631634 if (kernel.getInt32KernelAttr (Attributes::ATTR_Target) != VISA_3D) {
632635 // Do not run pre-RA scheduler for CM unless user forces it.
633- if (!kernel.getOption (vISA_preRA_ScheduleForce))
636+ if (!kernel.getOptions ()-> getOption (vISA_preRA_ScheduleForce))
634637 return false ;
635638 }
636639
637- bool Changed = false ;
640+ bool changed = false ;
638641
639- unsigned SchedCtrl = kernel.getuInt32Option (vISA_preRA_ScheduleCtrl);
642+ unsigned SchedCtrl =
643+ kernel.getOptions ()->getuInt32Option (vISA_preRA_ScheduleCtrl);
640644 SchedConfig config (SchedCtrl);
641- RegisterPressure rp (kernel, nullptr );
642- KernelPressure = rp.getMaxRP ();
643- unsigned RPReductionThreshold = getRPReductionThreshold (kernel);
644- auto LT = LatencyTable::createLatencyTable (*kernel.fg .builder );
645645
646- // Schedule for reg pressure reduction if needed
646+ RegisterPressure rp (kernel, rpe);
647+
// rpBB maps each measured block to its pressure. On the '+' side
// KernelPressure is a local, where the '-' side had it as an out-parameter.
648+ std::unordered_map<G4_BB *, unsigned int > rpBB;
649+ unsigned KernelPressure = 0 ;
650+
651+ // Obtain register pressure estimate of every BB
647652 for (auto bb : kernel.fg ) {
// The '+' measuring loop uses LARGE_BLOCK_SIZE_RPE as its upper size bound;
// the scheduling loop further down uses LARGE_BLOCK_SIZE.
648- // Skip BBs:
649- if (bb->size () < SMALL_BLOCK_SIZE || bb->size () > LARGE_BLOCK_SIZE) {
653+ if (bb->size () < SMALL_BLOCK_SIZE || bb->size () > LARGE_BLOCK_SIZE_RPE) {
650654 SCHED_DUMP (std::cerr << " Skip block with instructions " << bb->size ()
651655 << " \n " );
652656 continue ;
653657 }
654658
655- if (kernel.getuInt32Option (vISA_ScheduleStartBBID) &&
656- (bb->getId () < kernel.getuInt32Option (vISA_ScheduleStartBBID))) {
657- SCHED_DUMP (std::cerr << " Skip BB" << bb->getId () << " \n " );
658- continue ;
659- }
659+ unsigned pressure = rp.getPressure (bb);
660+ rpBB[bb] = pressure;
660661
661- if (kernel.getuInt32Option (vISA_ScheduleEndBBID) &&
662- (bb->getId () > kernel.getuInt32Option (vISA_ScheduleEndBBID))) {
663- SCHED_DUMP (std::cerr << " Skip BB" << bb->getId () << " \n " );
664- continue ;
662+ if (pressure > KernelPressure) {
663+ KernelPressure = pressure;
665664 }
666-
667- // Schedule:
668- SCHED_DUMP (rp.dump (bb, " Before scheduling for pressure reduction, " ));
669- preDDD ddd (kernel, bb);
670- BB_Scheduler S (kernel, ddd, rp, config, *LT);
671- unsigned BBRP = rp.getPressure (bb);
672- Changed |= S.scheduleBlockForPressure (BBRP, RPReductionThreshold);
673665 }
674666
675- if (Changed) {
676- // Re-compute register pressure estimation
677- rp.recompute ();
678- KernelPressure = rp.getMaxRP ();
// The '+' condition below is skipped whenever vISA_HWThreadNumberPerEU is
// non-zero.
// NOTE(review): the right-hand side is an unsigned subtraction; presumably
// vISA_ReservedGRFNum never exceeds the scaled GRF size, otherwise it would
// wrap -- confirm.
667+ // Obs: Heuristic considering PVC with 2 GRF modes as of 03/2020
668+ // If maximum register pressure is higher than default GRF mode,
669+ // assign the smallest number of threads to this kernel.
670+ if (!kernel.getOptions ()->getuInt32Option (vISA_HWThreadNumberPerEU) &&
671+ (KernelPressure >
672+ kernel.getScaledGRFSize (PRESSURE_HIGH_THRESHOLD) -
673+ kernel.getOptions ()->getuInt32Option (vISA_ReservedGRFNum))) {
674+ // Update number of threads, GRF, Acc and SWSB
675+ kernel.updateKernelToLargerGRF ();
679676 }
680677
681- // Adjust GRF based on register pressure
682- unsigned oldGRFNum = kernel.getNumRegTotal ();
683- kernel.updateKernelByRegPressure (KernelPressure);
684- bool GRFdecreased = kernel.getNumRegTotal () < oldGRFNum;
685- Changed = false ;
678+ // skip extreme test cases that scheduling does not good
679+ // if (kernel.fg.getNumBB() >= 10000 && KernelPressure >= 800)
680+ // return false;
681+
682+ unsigned Threshold = getRPReductionThreshold (kernel);
683+ auto LT = LatencyTable::createLatencyTable (*kernel.fg .builder );
686684
687- // Schedule for latency hiding if needed
688685 for (auto bb : kernel.fg ) {
689- // Skip BBs:
690686 if (bb->size () < SMALL_BLOCK_SIZE || bb->size () > LARGE_BLOCK_SIZE) {
691687 SCHED_DUMP (std::cerr << " Skip block with instructions " << bb->size ()
692688 << " \n " );
693689 continue ;
694690 }
695691
// Optional basic-block ID window: a value of 0 for either option disables
// that bound.
696- if (kernel.getuInt32Option (vISA_ScheduleStartBBID) &&
697- (bb->getId () < kernel.getuInt32Option (vISA_ScheduleStartBBID))) {
692+ if (kernel.getOptions ()->getuInt32Option (vISA_ScheduleStartBBID) &&
693+ (bb->getId () <
694+ kernel.getOptions ()->getuInt32Option (vISA_ScheduleStartBBID))) {
698695 SCHED_DUMP (std::cerr << " Skip BB" << bb->getId () << " \n " );
699696 continue ;
700697 }
701698
702- if (kernel.getuInt32Option (vISA_ScheduleEndBBID) &&
703- (bb->getId () > kernel.getuInt32Option (vISA_ScheduleEndBBID))) {
699+ if (kernel.getOptions ()->getuInt32Option (vISA_ScheduleEndBBID) &&
700+ (bb->getId () >
701+ kernel.getOptions ()->getuInt32Option (vISA_ScheduleEndBBID))) {
704702 SCHED_DUMP (std::cerr << " Skip BB" << bb->getId () << " \n " );
705703 continue ;
706704 }
707705
// Blocks missing from rpBB get pressure 0. With the size bounds shown at the
// top of the file (LARGE_BLOCK_SIZE 20000, LARGE_BLOCK_SIZE_RPE 32000), every
// block reaching this point should already have been recorded by the
// measuring loop, so the fallback looks purely defensive.
// NOTE(review): find() followed by operator[] looks the key up twice.
708- // Schedule:
709- SCHED_DUMP (rp.dump (bb, " Before scheduling for latency hiding, " ));
706+ unsigned MaxPressure = rpBB.find (bb) == rpBB.end () ? 0 : rpBB[bb];
707+ if (MaxPressure <= Threshold && !config.UseLatency ) {
708+ SCHED_DUMP (std::cerr << " Skip block with rp " << MaxPressure << " \n " );
709+ continue ;
710+ }
711+
712+ SCHED_DUMP (rp.dump (bb, " Before scheduling, " ));
710713 preDDD ddd (kernel, bb);
711714 BB_Scheduler S (kernel, ddd, rp, config, *LT);
712- unsigned BBRP = rp.getPressure (bb);
713715
// '-' side: may pass a larger GRF bound to the latency scheduler after the
// GRF count was decreased.
// '+' side: always passes 0 as the last argument.
714- unsigned UpperBoundGRF = 0 ;
715- if (GRFdecreased && KernelPressure < kernel.grfMode .getMaxGRF ())
716- UpperBoundGRF = kernel.grfMode .getLargerGRF ();
717- Changed |= S.scheduleBlockForLatency (BBRP, Changed, UpperBoundGRF);
716+ changed |= S.scheduleBlockForPressure (MaxPressure, Threshold);
717+ changed |= S.scheduleBlockForLatency (MaxPressure, changed, 0 );
718718 }
// '+' side: the estimator is rerun only when vISA_PreSchedGRFPressure is set,
// independent of `changed`, and the result goes to the JIT stats.
// '-' side: recomputed only when Changed, then adjusted the kernel by the
// resulting pressure.
719-
720- if (Changed) {
721- rp.recompute ();
722- KernelPressure = rp.getMaxRP ();
719+ if (kernel.getOptions ()->getOption (vISA_PreSchedGRFPressure)) {
720+ rp.rpe ->run ();
721+ kernel.fg .builder ->getJitInfo ()->stats .maxGRFPressure = rp.rpe ->getMaxRP ();
723722 }
724- kernel.updateKernelByRegPressure (KernelPressure);
725-
726- return Changed;
723+ return changed;
727724}
728725
729726bool BB_Scheduler::verifyScheduling () {
@@ -1434,14 +1431,13 @@ class LatencyQueue : public QueueBase {
14341431
14351432//
14361433bool BB_Scheduler::scheduleBlockForLatency (unsigned &MaxPressure,
1437- bool ReassignID, unsigned UpperBoundGRF ) {
1434+ bool ReassignID, unsigned KernelRP ) {
14381435 auto tryLatencyHiding = [=](unsigned nr) {
14391436 if (!config.UseLatency )
14401437 return false ;
14411438
1442- // UpperBoundGRF == 0 means we are scheduling for the fixed number of GRF
1443- if (UpperBoundGRF == 0 &&
1444- MaxPressure >= getLatencyHidingThreshold (kernel, nr))
1439+ // KernelRP == 0 means we are scheduling for the fixed number of GRF
1440+ if (KernelRP == 0 && MaxPressure >= getLatencyHidingThreshold (kernel, nr))
14451441 return false ;
14461442
14471443 // simple ROI check.
@@ -1464,15 +1460,17 @@ bool BB_Scheduler::scheduleBlockForLatency(unsigned &MaxPressure,
14641460 if (!tryLatencyHiding (NumGrfs))
14651461 return false ;
14661462
1467- // UpperBoundGRF == 0 means we only schedule under single NumGRF
1463+ // UpperBoundGRF == NumGrfs means we only schedule under single NumGRF
14681464 // setting for this block instead of trying to find the best schedule
1469- // among multiple NumGRF settings.
1470- if (UpperBoundGRF == 0 )
1471- UpperBoundGRF = NumGrfs;
1472-
1465+ // among multiple NumGRF setting.
1466+ unsigned UpperBoundGRF = NumGrfs;
14731467 unsigned SavedEstimation = 0 ;
14741468 std::vector<G4_INST *> SavedSchedule;
14751469
1470+ // multiple settings are applied only to some blocks to save time
1471+ if (KernelRP > 0 && MaxPressure > 40 && MaxPressure * 2 > KernelRP)
1472+ UpperBoundGRF = std::max (256U , UpperBoundGRF);
1473+
14761474 for (; NumGrfs <= UpperBoundGRF; NumGrfs += 32 ) {
14771475 // try grouping-threshold decremently until we find a schedule likely won't
14781476 // spill
0 commit comments