 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.

-/// \file GPUTPCCFCheckPadBaseline.h
+/// \file GPUTPCCFCheckPadBaseline.cxx
 /// \author Felix Weiglhofer

 #include "GPUTPCCFCheckPadBaseline.h"
 #include "CfArray2D.h"
 #include "PackedCharge.h"
-#include "GPUTPCGeometry.h"
 #include "clusterFinderDefs.h"

 #ifndef GPUCA_GPUCODE
@@ -28,51 +27,88 @@ using namespace o2::gpu::tpccf;
 template <>
 GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
 {
-  const CfFragment& fragment = clusterer.mPmemory->fragment;
-  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-
-  int32_t basePad = iBlock * PadsPerCacheline;
-  CfChargePos basePos = padToCfChargePos(basePad, clusterer);
+#ifdef GPUCA_GPUCODE
+  CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
+#else
+  CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
+#endif
+}

-  if (not basePos.valid()) {
+// Charges are stored in a 2D array (pad and time) using a tiling layout.
+// Tiles are 8 pads x 4 timebins in size, are stored in time-major order, and each tile fills exactly one cacheline.
+//
+// This kernel processes one row per block. Threads cooperatively load chunks
+// of 4 consecutive time bins for all pads into shared memory. Thread `i` then processes the charges for pad `i` from shared memory.
+// Blocks require `nextMultipleOf<64>(138 * 4) = 576` threads to correctly process the largest TPC rows, which have 138 pads.
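+//
+// For example (a sketch of the index mapping below, assuming MaxNPadsPerRow == 138
+// and NumOfCachedTBs == 4): thread 140 gets iPadOffset = 140 % 138 = 2 and
+// iTimeOffset = 140 / 138 = 1, i.e. it loads pad 2 of the second cached time bin.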
+GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
+{
+#ifdef GPUCA_GPUCODE
+  if (iBlock >= GPUCA_ROW_COUNT) {
     return;
   }

-#ifdef GPUCA_GPUCODE
-  static_assert(TPC_MAX_FRAGMENT_LEN_GPU % NumOfCachedTimebins == 0);
+  const CfFragment& fragment = clusterer.mPmemory->fragment;
+  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+
+  const auto iRow = iBlock;
+  const auto rowinfo = GetRowInfo(iRow);
+  const CfChargePos basePos{(Row)iRow, 0, 0};
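+  // basePos anchors at pad 0, time bin 0 of this block's row; per-thread offsets are applied below.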

   int32_t totalCharges = 0;
   int32_t consecCharges = 0;
   int32_t maxConsecCharges = 0;
   Charge maxCharge = 0;

-  int16_t localPadId = iThread / NumOfCachedTimebins;
-  int16_t localTimeBin = iThread % NumOfCachedTimebins;
-  bool handlePad = localTimeBin == 0;
+  const int16_t iPadOffset = iThread % MaxNPadsPerRow;
+  const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
+  const int16_t iPadHandle = iThread;
+  const bool handlePad = iPadHandle < rowinfo.nPads;
+
+  const auto firstTB = fragment.firstNonOverlapTimeBin();
+  const auto lastTB = fragment.lastNonOverlapTimeBin();
+
+  for (auto t = firstTB; t < lastTB; t += NumOfCachedTBs) {
+
+    const TPCFragmentTime iTime = t + iTimeOffset;
+
+    const CfChargePos pos = basePos.delta({iPadOffset, iTime});
+
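+    // Out-of-range pads and time bins past the fragment end read as zero charge.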
+    smem.charges[iTimeOffset][iPadOffset] = iTime < lastTB && iPadOffset < rowinfo.nPads ? chargeMap[pos].unpack() : 0;

-  for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
-    const CfChargePos pos = basePos.delta({localPadId, int16_t(t + localTimeBin)});
-    smem.charges[localPadId][localTimeBin] = (pos.valid()) ? chargeMap[pos].unpack() : 0;
     GPUbarrier();
+
     if (handlePad) {
-      for (int32_t i = 0; i < NumOfCachedTimebins; i++) {
-        const Charge q = smem.charges[localPadId][i];
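+      // Per pad: count the non-zero samples, track the longest run of
+      // consecutive non-zero samples, and keep the peak charge.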
+      for (int32_t i = 0; i < NumOfCachedTBs; i++) {
+        const Charge q = smem.charges[i][iPadHandle];
         totalCharges += (q > 0);
         consecCharges = (q > 0) ? consecCharges + 1 : 0;
         maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges);
         maxCharge = CAMath::Max<Charge>(q, maxCharge);
       }
     }
+
     GPUbarrier();
   }

-  GPUbarrier();
-
   if (handlePad) {
-    updatePadBaseline(basePad + localPadId, clusterer, totalCharges, maxConsecCharges, maxCharge);
+    updatePadBaseline(rowinfo.globalPadOffset + iPadOffset, clusterer, totalCharges, maxConsecCharges, maxCharge);
   }
+#endif
+}

-#else // CPU CODE
+GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
+{
+#ifndef GPUCA_GPUCODE
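+  // CPU path: each block handles one cacheline-aligned group of PadsPerCacheline pads.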
+  const CfFragment& fragment = clusterer.mPmemory->fragment;
+  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+
+  int32_t basePad = iBlock * PadsPerCacheline;
+  int32_t padsPerRow;
+  CfChargePos basePos = padToCfChargePos<PadsPerCacheline>(basePad, clusterer, padsPerRow);
+
+  if (not basePos.valid()) {
+    return;
+  }

   constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;

@@ -122,25 +158,40 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
 #endif
 }

-GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer)
+template <int32_t PadsPerBlock>
+GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer, int32_t& padsPerRow)
 {
   constexpr GPUTPCGeometry geo;

   int32_t padOffset = 0;
   for (Row r = 0; r < GPUCA_ROW_COUNT; r++) {
     int32_t npads = geo.NPads(r);
     int32_t padInRow = pad - padOffset;
-    if (0 <= padInRow && padInRow < CAMath::nextMultipleOf<PadsPerCacheline, int32_t>(npads)) {
-      int32_t cachelineOffset = padInRow % PadsPerCacheline;
+    if (0 <= padInRow && padInRow < npads) {
+      int32_t cachelineOffset = padInRow % PadsPerBlock;
       pad -= cachelineOffset;
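+      // e.g. padInRow = 13 with PadsPerBlock = 8: cachelineOffset = 5, so the
+      // returned position is snapped back to the cacheline start at pad 8.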
+      padsPerRow = npads;
       return CfChargePos{r, Pad(padInRow - cachelineOffset), 0};
     }
     padOffset += npads;
   }

+  padsPerRow = 0;
   return CfChargePos{0, 0, INVALID_TIME_BIN};
 }

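+// Returns the offset of the first pad of `row` within the global pad numbering,
+// together with the number of pads in this row.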
+GPUd() GPUTPCCFCheckPadBaseline::RowInfo GPUTPCCFCheckPadBaseline::GetRowInfo(int16_t row)
+{
+  constexpr GPUTPCGeometry geo;
+
+  int16_t padOffset = 0;
+  for (int16_t r = 0; r < row; r++) {
+    padOffset += geo.NPads(r);
+  }
+
+  return RowInfo{padOffset, geo.NPads(row)};
+}
+
 GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTPCClusterFinder& clusterer, int32_t totalCharges, int32_t consecCharges, Charge maxCharge)
 {
   const CfFragment& fragment = clusterer.mPmemory->fragment;