@@ -8572,50 +8572,34 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
     const bool useInlineData = builder.getOption(vISA_useInlineData);
 
     // preparation of thread payload size and start offsets
-
-    // Payload in Memory                              Payload in GRF (T0)
-    // (Prepared by Runtime)
-    // (Does not contain inlineData)
-    // -----------------------                       R1  -----------------------  <-- perThreadLoadStartGRF
-    // | cross thread data   | \                         | per thread data T0  |
-    // |                     |  numCrossThreadDW     R4  -----------------------
-    // |                     | /                         | inline data         |
-    // ----------------------- <-- localIDsOffset        | (if enable)         |
-    // | per thread data T0  |                       R5  -----------------------  <-- crossThreadLoadStart, crossThreadLoadStartGRF
-    // -----------------------                           | cross thread data   | \
-    // | per thread data T1  |                           |                     |  numCrossThreadDW
-    // -----------------------                           |                     | /
-    // |         ...         |                           -----------------------
-    // -----------------------
-
     const uint32_t perThreadLoadStartGRF = kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
     int PTIS = kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize);
     uint32_t numPerThreadGRF = PTIS / kernel.numEltPerGRF<Type_UB>();
     uint32_t crossThreadLoadStart = 0;    // register file (grf) offset in byte
     uint32_t crossThreadLoadStartGRF = 0; // grf number
     // cross thread size (not including inlinedata size and alignment)
     const uint32_t loadedCrossThreadInputSize = findLoadedInputSize(crossThreadLoadStart);
-    // final cross thread size to be loaded as number of DW (including alignment)
-    uint32_t numCrossThreadDW = 0;
+    // final cross thread size to be loaded
+    uint32_t numCrossThreadGRF = 0;
     // payload memory offset of where local id should be loaded from
     uint32_t localIDsOffset = 0;
     int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
     if (CTIS < 0)
     {
         // per-thread payload vars
         // N = inlinedata size
-        // Cross thread data size is aligned to 32byte,
+        // Payload is aligned to grf size,
         // if inlinedata is used, runtime puts first N bytes of payload in inlinedata.
         // Rest of payload is shifted in the buffer by N bytes.
         // So payload args which start at N offset, now start at 0 offset.
         // Because of this we need to calculate localID offset:
         const uint32_t inlineDataSize = builder.getInlineDataSize();
         uint32_t correction = useInlineData ? inlineDataSize : 0;
-        localIDsOffset = AlignUp(loadedCrossThreadInputSize + correction, 32);
+        localIDsOffset = AlignUp(loadedCrossThreadInputSize + correction, kernel.getGRFSize());
         localIDsOffset -= useInlineData ? inlineDataSize : 0;
 
         // cross-thread payload vars
-        numCrossThreadDW = AlignUp(loadedCrossThreadInputSize, 32) / TypeSize(Type_UD);
+        numCrossThreadGRF = AlignUp(loadedCrossThreadInputSize, kernel.getGRFSize()) / kernel.numEltPerGRF<Type_UB>();
         crossThreadLoadStartGRF = crossThreadLoadStart / kernel.getGRFSize();
     }
     else
@@ -8625,13 +8609,13 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
         localIDsOffset -= useInlineData ? kernel.getGRFSize() : 0;
 
         // cross-thread payload vars
-        numCrossThreadDW = CTIS / TypeSize(Type_UD);
+        numCrossThreadGRF = CTIS / kernel.numEltPerGRF<Type_UB>();
         crossThreadLoadStartGRF = perThreadLoadStartGRF + numPerThreadGRF;
         if (useInlineData)
         {
             // first GRF of cross-thread data is already loaded
             crossThreadLoadStartGRF++;
-            numCrossThreadDW -= builder.getInlineDataSize() / TypeSize(Type_UD);
+            numCrossThreadGRF--;
         }
     }
 
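To make the new GRF-based sizing easier to follow, here is a minimal self-contained sketch of the arithmetic in the CTIS < 0 branch above. The concrete numbers (a 64-byte GRF, 32 bytes of inline data, 40 bytes of loaded cross-thread input) are illustrative assumptions rather than values taken from the commit, and AlignUp is a local stand-in for the vISA helper of the same name.

#include <cassert>
#include <cstdint>

// Local stand-in for the vISA AlignUp helper: round v up to a multiple of align.
static uint32_t AlignUp(uint32_t v, uint32_t align) { return (v + align - 1) / align * align; }

int main() {
    const uint32_t grfSizeBytes = 64;               // assumed kernel.getGRFSize()
    const uint32_t inlineDataSize = 32;             // assumed builder.getInlineDataSize()
    const uint32_t loadedCrossThreadInputSize = 40; // assumed cross-thread bytes to load
    const bool useInlineData = true;

    // Same arithmetic as the CTIS < 0 branch: align (payload + inline-data correction)
    // up to a whole GRF, then subtract the correction again.
    uint32_t correction = useInlineData ? inlineDataSize : 0;
    uint32_t localIDsOffset = AlignUp(loadedCrossThreadInputSize + correction, grfSizeBytes);
    localIDsOffset -= correction;

    // Cross-thread size is now counted in whole GRFs instead of DWs.
    uint32_t numCrossThreadGRF = AlignUp(loadedCrossThreadInputSize, grfSizeBytes) / grfSizeBytes;

    assert(localIDsOffset == 96);   // AlignUp(40 + 32, 64) - 32
    assert(numCrossThreadGRF == 1); // 40 bytes fit in a single 64-byte GRF
    return 0;
}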
@@ -8661,21 +8645,18 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
 
     // load <numGRF> GRFs from the address "loadAddress", starting from <startGRF>
     auto loadFromMemory = [this, &instBuffer, getHWordBlockEncoding](
-        G4_Declare* loadAddress, uint32_t startGRF, uint32_t numTotalDW)
+        G4_Declare* loadAddress, uint32_t startGRF, uint32_t numGRF)
     {
-        for (uint32_t numRemainingDW = numTotalDW, nextGRF = startGRF; numRemainingDW > 0; /* updated in body */)
+        bool useHword = builder.hasHWordBlockLoad();
+        for (int numRemaining = numGRF, nextGRF = startGRF; numRemaining > 0; /* updated in body */)
         {
-            // can load 4, 2 or 1 grf per send.
-            // Still load 1 GRF if the remaining DWs are less than 1 GRF; the additional bytes loaded won't be used.
-            uint32_t DWin4GRF = 4 * builder.numEltPerGRF<Type_UD>();
-            uint32_t DWin2GRF = DWin4GRF / 2;
-            uint32_t DWin1GRF = DWin2GRF / 2;
-            uint32_t numGRFToLoad =
-                numRemainingDW >= DWin4GRF ? 4 : // 4 GRF
-                numRemainingDW >= DWin2GRF ? 2 : // 2 GRF
-                1;                               // 1 GRF or less than 1 GRF
-
-            bool useHword = builder.hasHWordBlockLoad();
+            int numGRFToLoad = numRemaining > 2 ? 4 : numRemaining;
+            if (numRemaining == 3)
+            {
+                // we can't do a 4-GRF load since it may overwrite values pushed from inline data,
+                // break the load into 2+1 instead
+                numGRFToLoad = 2;
+            }
             uint32_t numElts = (numGRFToLoad * kernel.getGRFSize()) / (useHword ? 32 : 16);
             uint32_t dataBlocks = useHword ? getHWordBlockEncoding(numElts) :
                 (numElts == 2 ? 2 : (numElts == 4 ? 3 : 4));
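The effect of the rewritten loop is easiest to see by running its split policy on its own: at most 4 GRFs per block load, with a remainder of exactly 3 GRFs emitted as 2 + 1 so a 4-GRF load cannot clobber the GRF already populated from inline data. The sketch below reproduces only that selection logic; the send construction and descriptors are omitted, and the payload totals iterated over are arbitrary examples.

#include <cstdio>
#include <initializer_list>

int main() {
    // Same block-size selection as the rewritten loadFromMemory loop above.
    for (int totalGRF : {1, 2, 3, 4, 5, 6, 7}) {
        std::printf("%d GRF payload ->", totalGRF);
        for (int numRemaining = totalGRF; numRemaining > 0; ) {
            int numGRFToLoad = numRemaining > 2 ? 4 : numRemaining;
            if (numRemaining == 3)
                numGRFToLoad = 2; // break 3 GRFs into 2 + 1
            std::printf(" %d", numGRFToLoad);
            numRemaining -= numGRFToLoad;
        }
        std::printf("\n");
    }
    return 0;
}

A 7-GRF payload, for example, is emitted as 4 + 2 + 1 sends.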
@@ -8690,11 +8671,9 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
             auto sendInst = builder.createSendInst(nullptr, G4_send, g4::SIMD8, sendDst, sendSrc,
                 builder.createImm(msgDescVal, Type_UD), InstOpt_WriteEnable, desc, true);
             instBuffer.push_back(sendInst);
-            if (numRemainingDW < DWin1GRF)
-                break;
-            numRemainingDW -= numGRFToLoad * builder.numEltPerGRF<Type_UD>();
+            numRemaining -= numGRFToLoad;
             nextGRF += numGRFToLoad;
-            if (numRemainingDW > 0)
+            if (numRemaining > 0)
             {
                 // advance the address offset
                 // (W) add (1) loadAddress.2 loadAddress.2 numGRFToLoad*32
@@ -8708,36 +8687,18 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
             }
         }
     };
-
-    // a helper function for loadFromMemoryLSC to get the max DW number which can fulfill
-    // LSC element number
-    auto getMaxNumDWforLscElementRequirement = [this](uint32_t numDW) {
-        if (builder.lscGetElementNum(numDW) != LSC_DATA_ELEMS_INVALID)
-            return numDW;
-        if (numDW > builder.numEltPerGRF<Type_UD>()) {
-            if (numDW > 64) return (uint32_t)64;
-            else if (numDW > 32) return (uint32_t)32;
-            else if (numDW > 16) return (uint32_t)16;
-            else if (numDW > 8) return (uint32_t)8;
-            assert(0 && "unreachable");
-        }
-        // when the numDW is less than 1 grf, we want to load all within one send
-        // The additional bytes being loaded won't be used so should be fine
-        if (numDW < 2) return (uint32_t)2;
-        else if (numDW < 4) return (uint32_t)4;
-        else if (numDW < 8) return (uint32_t)8;
-        else if (numDW < 16) return (uint32_t)16;
-        assert(0 && "unreachable");
-        return (uint32_t)0;
-    };
-
-    auto loadFromMemoryLSC = [this, &instBuffer, &getMaxNumDWforLscElementRequirement](
-        G4_Declare* loadAddress, uint32_t startGRF, uint32_t numTotalDW)
+    auto loadFromMemoryLSC = [this, &instBuffer](
+        G4_Declare* loadAddress, uint32_t startGRF, uint32_t numGRF)
     {
         const auto ADDR_TYPE = LSC_ADDR_TYPE_BTI;
 
-        for (uint32_t numRemainingDW = numTotalDW, nextGRF = startGRF; numRemainingDW > 0; /* updated in body */)
+        for (int numRemaining = numGRF, nextGRF = startGRF; numRemaining > 0; /* updated in body */)
         {
+            int numGRFToLoad =
+                numRemaining > 4 ? 4 :
+                numRemaining == 3 ? 2 : // split to 2+1
+                numRemaining;           // 2 or 1
+
             // Generate an A32 transpose LSC load to BTI 255. size is d32x{16/32}t
             LSC_OP op = LSC_LOAD;
             LSC_SFID lscSfid = LSC_UGM;
@@ -8748,16 +8709,15 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
             addrInfo.immScale = 1;
             addrInfo.immOffset = 0;
             addrInfo.size = LSC_ADDR_SIZE_32b;
-
+            auto numDW = numGRFToLoad * (kernel.getGRFSize() / 4);
             LSC_DATA_SHAPE dataShape { };
             dataShape.size = LSC_DATA_SIZE_32b; // in the unit of 32b
             dataShape.order = LSC_DATA_ORDER_TRANSPOSE;
-            uint32_t numDWToLoad = getMaxNumDWforLscElementRequirement(numRemainingDW);
-            dataShape.elems = builder.lscGetElementNum(numDWToLoad);
+            dataShape.elems = builder.lscGetElementNum(numDW);
 
             G4_Imm* surfaceBTI = builder.createImm(255, Type_UW);
 
-            auto sendDstDcl = builder.createHardwiredDeclare(numDWToLoad, Type_UD, nextGRF, 0);
+            auto sendDstDcl = builder.createHardwiredDeclare(numDW, Type_UD, nextGRF, 0);
             auto dstRead = builder.createDstRegRegion(sendDstDcl, 1);
             auto src0Addr = builder.createSrcRegRegion(loadAddress, builder.getRegionStride1()); // address base
 
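Because each LSC transposed load now covers whole GRFs, the d32 element count per send follows directly from the GRF size (numDW = numGRFToLoad * GRFSize / 4). Below is a small sketch of that mapping under an assumed 64-byte GRF; the validity check is a hand-written list of element counts commonly encodable by a transposed d32 load (1, 2, 3, 4, 8, 16, 32, 64) standing in for builder.lscGetElementNum, so treat it as an assumption rather than a quote of the vISA helper.

#include <cstdio>
#include <initializer_list>

// Assumed set of encodable element counts for a transposed d32 LSC load (see lead-in).
static bool isEncodableElemCount(int elems) {
    for (int v : {1, 2, 3, 4, 8, 16, 32, 64})
        if (elems == v) return true;
    return false;
}

int main() {
    const int grfSizeBytes = 64;            // assumed kernel.getGRFSize()
    for (int numGRFToLoad : {1, 2, 4}) {    // block sizes emitted by the loop above
        int numDW = numGRFToLoad * (grfSizeBytes / 4); // same formula as the new code
        std::printf("%d GRF -> d32x%dt %s\n", numGRFToLoad, numDW,
                    isEncodableElemCount(numDW) ? "(encodable)" : "(needs another split)");
    }
    return 0;
}

With a 32-byte GRF the same formula gives 8, 16, and 32 elements, which also fall in that set, which is presumably why the per-DW fallback helper above could be dropped.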
@@ -8769,7 +8729,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
                 addrInfo,
                 dataShape,
                 surfaceBTI,
-                numDWToLoad < builder.numEltPerGRF<Type_UD>() ? 1 : numDWToLoad / builder.numEltPerGRF<Type_UD>(),
+                numGRFToLoad,
                 1);
 
             G4_InstSend *sendInst = builder.createLscSendInst(
@@ -8784,19 +8744,15 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
                 true);
 
             instBuffer.push_back(sendInst);
-            // we pick to load all data within one send in getMaxNumDWforLscElementRequirement if
-            // numRemainingDW is less than one grf. All should be loaded at this point.
-            if (numRemainingDW < builder.numEltPerGRF<Type_UD>())
-                break;
-            numRemainingDW -= numDWToLoad;
-            nextGRF += numDWToLoad / builder.numEltPerGRF<Type_UD>();
-            bool advanceLoadAddress = numRemainingDW > 0;
+            numRemaining -= numGRFToLoad;
+            nextGRF += numGRFToLoad;
+            bool advanceLoadAddress = numRemaining > 0;
             if (advanceLoadAddress)
             {
                 // advance the address offset
                 // (W) add (1) loadAddress.0 loadAddress.0 numGRFToLoad*32
                 auto addSrc0 = builder.createSrcRegRegion(loadAddress, builder.getRegionScalar());
-                auto addSrc1 = builder.createImm(numDWToLoad * TypeSize(Type_UD), Type_UW);
+                auto addSrc1 = builder.createImm(numGRFToLoad * kernel.numEltPerGRF<Type_UB>(), Type_UW);
                 auto addDst = builder.createDst(loadAddress->getRegVar(), 0, 0, 1, Type_UD);
                 auto addInst = builder.createBinOp(G4_add, g4::SIMD1, addDst,
                     addSrc0, addSrc1, InstOpt_WriteEnable, false);
@@ -8955,11 +8911,11 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
 
     if (useLSC)
     {
-        loadFromMemoryLSC(rtail, perThreadLoadStartGRF, numPerThreadGRF * builder.numEltPerGRF<Type_UD>());
+        loadFromMemoryLSC(rtail, perThreadLoadStartGRF, numPerThreadGRF);
     }
     else
     {
-        loadFromMemory(rtail, perThreadLoadStartGRF, numPerThreadGRF * builder.numEltPerGRF<Type_UD>());
+        loadFromMemory(rtail, perThreadLoadStartGRF, numPerThreadGRF);
     }
     perThreadBB = kernel.fg.createNewBB();
     perThreadBB->insert(perThreadBB->begin(), instBuffer.begin(), instBuffer.end());
@@ -9002,11 +8958,11 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
     {
         if (useLSC)
         {
-            loadFromMemoryLSC(rtail, crossThreadLoadStartGRF, numCrossThreadDW);
+            loadFromMemoryLSC(rtail, crossThreadLoadStartGRF, numCrossThreadGRF);
         }
         else
         {
-            loadFromMemory(rtail, crossThreadLoadStartGRF, numCrossThreadDW);
+            loadFromMemory(rtail, crossThreadLoadStartGRF, numCrossThreadGRF);
         }
     }
 