@@ -192,8 +192,7 @@ uint EmitPass::DecideInstanceAndSlice(llvm::BasicBlock& blk, SDAG& sdag, bool& s
     if (StoreInst * ST = dyn_cast<StoreInst>(sdag.m_root))
     {
         // Limit to OpenCL for now, as it has uniform load/store support.
-        if (m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER &&
-            isUniformStoreOCL(ST))
+        if (isUniformStoreOCL(ST))
             numInstance = 1;
         slicing = false;
     }
@@ -14338,12 +14337,14 @@ void EmitPass::emitftoi(llvm::GenIntrinsicInst* inst)
 // Return true if this store will be emitted as a uniform store
 bool EmitPass::isUniformStoreOCL(llvm::StoreInst* SI)
 {
-    if (!m_currShader->GetIsUniform(SI->getPointerOperand()))
+    if (m_currShader->GetShaderType() != ShaderType::OPENCL_SHADER ||
+        !m_currShader->GetIsUniform(SI->getPointerOperand()))
     {
         return false;
     }
 
-    Type* Ty = SI->getValueOperand()->getType();
+    Value* storeVal = SI->getValueOperand();
+    Type* Ty = storeVal->getType();
     VectorType* VTy = dyn_cast<VectorType>(Ty);
     uint32_t elts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
     Type* eltTy = VTy ? VTy->getElementType() : Ty;
@@ -14354,8 +14355,8 @@ bool EmitPass::isUniformStoreOCL(llvm::StoreInst* SI)
     // Note that when elts > 1, VectorProcess makes sure that its element
     // size is 4 or 8. Also note that if totalBytes = 4, elts must be 1.
     bool doUniformStore = (elts == 1 ||
-        (m_currShader->GetIsUniform(SI->getValueOperand()) &&
-         (totalBytes == 8 || totalBytes == 12 || totalBytes == 16)));
+        (m_currShader->GetIsUniform(storeVal) &&
+         (totalBytes == 8 || totalBytes == 12 || totalBytes == 16)));
     return doUniformStore;
 }
 
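
Taken together, the two hunks above fold the OpenCL shader-type check into
isUniformStoreOCL() itself, so DecideInstanceAndSlice() only needs the one
call. As a standalone restatement of the resulting predicate (a hypothetical
free function using plain flags in place of the CVariable/StoreInst queries;
a sketch, not the IGC code itself):

    #include <cstdint>

    // A store is emitted as a uniform store iff the shader is OpenCL, the
    // address is uniform, and the payload is either a scalar or a uniform
    // vector totaling 8, 12, or 16 bytes (element size 4 or 8).
    bool isUniformStoreProfile(bool isOpenCL, bool ptrUniform, bool valUniform,
                               uint32_t elts, uint32_t eltBytes)
    {
        if (!isOpenCL || !ptrUniform)
            return false;
        const uint32_t totalBytes = elts * eltBytes;
        return elts == 1 ||
               (valUniform &&
                (totalBytes == 8 || totalBytes == 12 || totalBytes == 16));
    }
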
@@ -15685,6 +15686,215 @@ void EmitPass::emitVectorStore(StoreInst* inst, Value* offset, ConstantInt* immO
     }
 }
 
+// prepareAddressForUniform(): for both load and store.
+// prepareDataForUniform():    for store only.
+// Helpers for unaligned (less than 4 bytes) uniform load/store: one builds
+// the address payload, the other the data payload.
+//
+// Example 1: "store <4xi32> V, <4xi32>* P, align 2"
+//   A new pointer pVar is created with 4 elements.
+//
+//     add  (4|M0_NM) pVar<1>:ud  P<0;1,0>:UD  0xC840:UV
+//     send (4|M0_NM) pVar V
+//
+//   prepareAddressForUniform() : create pVar
+//   prepareDataForUniform()    : return V (assuming V can be used directly)
+//
+// Example 2: "store <3xi32> V, <3xi32>* P, align 2"
+//   A non-power-of-2 vector size is rounded up to the next power of 2;
+//   the additional elements are duplicated from the first vector element.
+//
+//     add  (4|M0_NM) pVar<1>:ud   P<0;1,0>:UD  0x0840:UV
+//     mov  (4|M0_NM) vVar<1>:ud   V<0;1,0>:ud
+//     mov  (2|M0_NM) vVar<1>:ud   V<1;1,0>:ud
+//     mov  (1|M0_NM) vVar.2<1>:ud V.2<1;1,0>:ud
+//     send (4|M0_NM) vVar pVar
+//
+//   prepareAddressForUniform() : create pVar
+//   prepareDataForUniform()    : return vVar
+//
+// This function handles vector sizes up to 8 and also handles QW element
+// size. When the vector size > 4, 0x76543210, left-shifted by 2 (DW) or
+// 3 (QW), is used as an immediate to be added to 'AddrVar' to form a new address var.
+//
+// In addition, if a 64-bit add is not supported, emitAddPair() is used to
+// emulate it with 32-bit add/addc.
+//
+// Note that the argument 'AddrVar' of prepareAddressForUniform() is uniform,
+// as is its return var; the same holds for 'DataVar' of prepareDataForUniform().
+//
+CVariable* EmitPass::prepareAddressForUniform(
+    CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t ExecSz, e_alignment Align)
+{
+    IGC_ASSERT(NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
+    if (ExecSz == 1 && AddrVar->IsGRFAligned(Align))
+    {
+        return AddrVar;
+    }
+    bool isA64 = (AddrVar->GetElemSize() == 8);
+    SIMDMode simdmode = lanesToSIMDMode(ExecSz);
+    CVariable* newVar = m_currShader->GetNewVariable(ExecSz, AddrVar->GetType(), Align, true, CName::NONE);
+
+    CVariable* off;
+    uint32_t incImm = (0x76543210 & maskTrailingOnes<uint32_t>(NElts * 4));
+    if ((ExecSz <= 4 && EltBytes == 4) || (ExecSz <= 2 && EltBytes == 8))
+    {
+        // This case needs a single UV immediate
+        incImm = incImm << (EltBytes == 4 ? 2 : 3);
+        off = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
+    }
+    else
+    {
+        // Need a temporary var to calculate offsets
+        off = m_currShader->GetNewVariable(ExecSz, ISA_TYPE_UD, EALIGN_DWORD, false, CName::NONE);
+
+        // actualES is the actual execsize used for computing offsets.
+        uint32_t actualES = (uint32_t)PowerOf2Ceil(NElts);
+
+        // incImm is of UV type and can be used with execsize <= 8 only. If
+        // ExecSz is greater than the actual number of lanes (for example, the
+        // 4-GRF alignment case), the lanes beyond actualES need to be zeroed.
+        if (ExecSz > actualES)
+        {
+            // Zero the upper lanes.
+            m_encoder->SetNoMask();
+            m_encoder->SetSimdSize(simdmode);
+            m_encoder->Copy(off, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
+            m_encoder->Push();
+        }
+
+        SIMDMode sm = lanesToSIMDMode(actualES);
+        if (incImm > 0 &&
+            ((actualES <= 4 && EltBytes == 4) || (actualES <= 2 && EltBytes == 8)))
+        {
+            // This case needs a single UV immediate
+            incImm = incImm << (EltBytes == 4 ? 2 : 3);
+
+            m_encoder->SetNoMask();
+            m_encoder->SetSimdSize(sm);
+            m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
+            m_encoder->Push();
+        }
+        else if (incImm > 0)
+        {
+            // Need a mov and a mul
+            m_encoder->SetNoMask();
+            m_encoder->SetSimdSize(sm);
+            m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
+            m_encoder->Push();
+
+            m_encoder->SetNoMask();
+            m_encoder->SetSimdSize(sm);
+            m_encoder->SetSrcRegion(0, 1, 1, 0);
+            m_encoder->SetSrcRegion(1, 0, 1, 0);
+            m_encoder->Mul(off, off, m_currShader->ImmToVariable(EltBytes, ISA_TYPE_UW));
+            m_encoder->Push();
+        }
+    }
+
+    // May need splitting for A64
+    bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
+    if (needSplit)
+    {
+        IGC_ASSERT(!off->IsImmediate());
+        uint32_t bytes1 = (ExecSz / 2) * newVar->GetElemSize();
+        uint32_t bytes2 = (ExecSz / 2) * off->GetElemSize();
+        CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, ExecSz / 2);
+        CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes1, ExecSz / 2);
+        CVariable* offHi = m_currShader->GetNewAlias(off, off->GetType(), 0, ExecSz / 2);
+        CVariable* offLo = m_currShader->GetNewAlias(off, off->GetType(), bytes2, ExecSz / 2);
+
+        if (m_currShader->m_Platform->hasNoInt64Inst())
+        {
+            emitAddPair(newVarHi, AddrVar, offHi);
+            emitAddPair(newVarLo, AddrVar, offLo);
+        }
+        else
+        {
+            SIMDMode sm = lanesToSIMDMode(ExecSz / 2);
+            m_encoder->SetNoMask();
+            m_encoder->SetUniformSIMDSize(sm);
+            m_encoder->SetSrcRegion(0, 0, 1, 0);
+            m_encoder->SetSrcRegion(1, 1, 1, 0);
+            m_encoder->Add(newVarHi, AddrVar, offHi);
+            m_encoder->Push();
+
+            m_encoder->SetNoMask();
+            m_encoder->SetUniformSIMDSize(sm);
+            m_encoder->SetSrcRegion(0, 0, 1, 0);
+            m_encoder->SetSrcRegion(1, 1, 1, 0);
+            m_encoder->Add(newVarLo, AddrVar, offLo);
+            m_encoder->Push();
+        }
+    }
+    else if (isA64 && m_currShader->m_Platform->hasNoInt64Inst())
+    {
+        emitAddPair(newVar, AddrVar, off);
+    }
+    else
+    {
+        m_encoder->SetNoMask();
+        m_encoder->SetUniformSIMDSize(simdmode);
+        m_encoder->SetSrcRegion(0, 0, 1, 0);
+        m_encoder->SetSrcRegion(1, 1, 1, 0);
+        m_encoder->Add(newVar, AddrVar, off);
+        m_encoder->Push();
+    }
+    return newVar;
+}
+
+CVariable* EmitPass::prepareDataForUniform(
+    CVariable* DataVar, uint32_t ExecSz, e_alignment Align)
+{
+    uint32_t NElts = DataVar->GetNumberElement();
+    uint32_t EltBytes = DataVar->GetElemSize();
+    IGC_ASSERT(ExecSz >= NElts && NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
+    if (NElts == ExecSz && !DataVar->IsImmediate() && DataVar->IsGRFAligned(Align))
+    {
+        return DataVar;
+    }
+    CVariable* newVar = m_currShader->GetNewVariable(ExecSz, DataVar->GetType(), Align, true, CName::NONE);
+
+    // Broadcast DataVar's first element to all lanes; this sets elements
+    // NElts to ExecSz-1 to the first element.
+    bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
+    if (needSplit)
+    {
+        uint32_t esz = ExecSz / 2;
+        uint32_t bytes = esz * newVar->GetElemSize();
+        CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, esz);
+        CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes, esz);
+
+        m_encoder->SetNoMask();
+        m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
+        m_encoder->SetSrcRegion(0, 0, 1, 0);
+        m_encoder->Copy(newVarHi, DataVar);
+        m_encoder->Push();
+
+        m_encoder->SetNoMask();
+        m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
+        m_encoder->SetSrcRegion(0, 0, 1, 0);
+        m_encoder->Copy(newVarLo, DataVar);
+        m_encoder->Push();
+    }
+    else
+    {
+        m_encoder->SetNoMask();
+        m_encoder->SetUniformSIMDSize(lanesToSIMDMode(ExecSz));
+        m_encoder->SetSrcRegion(0, 0, 1, 0);
+        m_encoder->Copy(newVar, DataVar);
+        m_encoder->Push();
+    }
+
+    if (!DataVar->IsImmediate() && NElts > 1)
+    {
+        // Copy the actual values over; elements NElts to ExecSz-1 keep the
+        // first element from the broadcast above.
+        emitVectorCopy(newVar, DataVar, NElts);
+    }
+    return newVar;
+}
+
 
 void EmitPass::emitVectorCopy(CVariable* Dst, CVariable* Src, uint32_t nElts,
     uint32_t DstSubRegOffset, uint32_t SrcSubRegOffset)
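
The 0xC840 and 0x0840 immediates shown in the comments above fall out of the
incImm computation in prepareAddressForUniform(). A minimal sketch of the
single-UV-immediate path (maskTrailingOnes32 is a self-contained stand-in for
llvm::maskTrailingOnes<uint32_t>; offsetImm is a hypothetical name):

    #include <cstdint>
    #include <cstdio>

    // Mask with the low n bits set, as llvm::maskTrailingOnes<uint32_t>(n).
    static uint32_t maskTrailingOnes32(unsigned n)
    {
        return n >= 32 ? 0xFFFFFFFFu : ((1u << n) - 1u);
    }

    // Per-lane byte offsets packed as 4-bit nibbles (a UV immediate). Valid
    // only while every scaled nibble still fits in 4 bits, which is why the
    // code guards this path with ExecSz <= 4 (DW) or ExecSz <= 2 (QW).
    static uint32_t offsetImm(uint32_t nElts, uint32_t eltBytes)
    {
        uint32_t imm = 0x76543210u & maskTrailingOnes32(nElts * 4);
        return imm << (eltBytes == 4 ? 2 : 3);   // x4 for DW, x8 for QW
    }

    int main()
    {
        printf("0x%X\n", offsetImm(4, 4)); // 0xC840 -- Example 1, <4 x i32>
        printf("0x%X\n", offsetImm(3, 4)); // 0x840  -- Example 2, <3 x i32>;
                                           // lane 3 reads offset 0 again,
                                           // matching the duplicated element
        return 0;
    }

For larger vectors the code instead movs the unscaled 0x76543210 pattern and
multiplies by EltBytes, since a scaled nibble such as 7 << 2 no longer fits in
4 bits. prepareDataForUniform() follows the same shape on the data side: it
first broadcasts element 0 across all ExecSz lanes, then emitVectorCopy()
overwrites lanes 0..NElts-1 with the real values, so the rounded-up lanes keep
the duplicated first element.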