
Commit 3669ad3

fweig authored and davidrohr committed
GPU: Parallelize TPC pad filter over pad rows instead of cachelines.
1 parent 52b0e23 commit 3669ad3

File tree (5 files changed: +118 −35 lines changed)

GPU/GPUTracking/DataTypes/GPUTPCGeometry.h
GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h

GPU/GPUTracking/DataTypes/GPUTPCGeometry.h

Lines changed: 2 additions & 0 deletions
@@ -96,6 +96,7 @@ class GPUTPCGeometry // TODO: Make values constexpr
   GPUd() static constexpr int32_t EndIROC() { return 63; }
   GPUd() static constexpr int32_t EndOROC1() { return 97; }
   GPUd() static constexpr int32_t EndOROC2() { return 127; }
+  GPUd() static constexpr int32_t MaxNPadsPerRow() { return 138; }
 #else
   GPUd() static constexpr int32_t GetRegion(int32_t row) { return (row < 63 ? 0 : row < 63 + 64 ? 1 : 2); }
   GPUd() static constexpr int32_t GetRegionRows(int32_t region) { return 0; } // dummy
@@ -104,6 +105,7 @@ class GPUTPCGeometry // TODO: Make values constexpr
   GPUd() static constexpr int32_t EndIROC() { return 63; }
   GPUd() static constexpr int32_t EndOROC1() { return 63 + 64; }
   GPUd() static constexpr int32_t EndOROC2() { return GPUCA_ROW_COUNT; }
+  GPUd() static constexpr int32_t MaxNPadsPerRow() { return 140; }
 #endif
 
   GPUd() static constexpr float TPCLength() { return 250.f - 0.275f; }

GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h

Lines changed: 4 additions & 4 deletions
@@ -68,7 +68,7 @@
 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
 #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128, 2
 #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 2
-#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64, 10
+#define GPUCA_LB_GPUTPCCFCheckPadBaseline 576, 2
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart 512
@@ -133,7 +133,7 @@
 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
 #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128, 2
 #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 64, 2
-#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64, 2
+#define GPUCA_LB_GPUTPCCFCheckPadBaseline 576, 2
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart 512
@@ -197,7 +197,7 @@
 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3
 #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 32, 1
 #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 32, 1
-#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8
+#define GPUCA_LB_GPUTPCCFCheckPadBaseline 576,2
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 448
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart 448
@@ -447,7 +447,7 @@
 #define GPUCA_LB_GPUTPCStartHitsSorter 256
 #endif
 #ifndef GPUCA_LB_GPUTPCCFCheckPadBaseline
-#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
+#define GPUCA_LB_GPUTPCCFCheckPadBaseline 576
 #endif
 #ifndef GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
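The retuned launch bound follows from the new per-row parallelization: the widest pad row has 138 pads, each thread covers one (pad, timebin) slot of a 4-timebin chunk, and 138 * 4 = 552 is rounded up to a 64-thread granularity, giving 576 threads per block (the second value presumably remains the per-multiprocessor occupancy hint used by the other GPUCA_LB_* entries). A minimal standalone sketch of that arithmetic; the next_multiple_of helper is a hypothetical stand-in for CAMath::nextMultipleOf:

#include <cstdint>

// Hypothetical stand-in for CAMath::nextMultipleOf, used only to illustrate
// how the 576-thread block size of GPUCA_LB_GPUTPCCFCheckPadBaseline is derived.
constexpr int32_t next_multiple_of(int32_t x, int32_t n) { return ((x + n - 1) / n) * n; }

constexpr int32_t kMaxRowPads = 138;     // widest TPC pad row (GPUTPCGeometry::MaxNPadsPerRow())
constexpr int32_t kTimebinsPerChunk = 4; // timebins cached per shared-memory pass

static_assert(next_multiple_of(kMaxRowPads * kTimebinsPerChunk, 64) == 576,
              "552 lanes rounded up to 64-thread granularity gives 576 threads per block");

int main() { return 0; }

The fallback value in the last hunk changes accordingly, so configurations without an explicit per-GPU tuning also launch 576-thread blocks.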

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 1 addition & 1 deletion
@@ -962,7 +962,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter;
 
       if (checkForNoisyPads) {
-        int32_t nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline;
+        const int32_t nBlocks = GPUTPCCFCheckPadBaseline::GetNBlocks(doGPU);
 
         runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
         getKernelTimer<GPUTPCCFCheckPadBaseline>(RecoStep::TPCClusterFinding, iSector, TPC_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false);
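GetNBlocks(doGPU) picks the grid size per backend: on GPU one block per pad row, on CPU the previous one-cacheline-chunk-per-block scheme (see the header change below). A hedged sketch of that selection; the numeric constants here are placeholders, the real values come from GPUCA_ROW_COUNT, TPC_PADS_IN_SECTOR and PadsPerCacheline:

#include <cstdint>
#include <cstdio>

// Placeholder constants for illustration only; the actual values are taken
// from the O2 headers (GPUCA_ROW_COUNT, TPC_PADS_IN_SECTOR, PadsPerCacheline).
constexpr int32_t kRowCount = 152;       // assumed number of TPC pad rows
constexpr int32_t kPadsInSector = 14560; // assumed pads per sector
constexpr int32_t kPadsPerCacheline = 8;

// Mirrors the shape of GPUTPCCFCheckPadBaseline::GetNBlocks(bool) from the header diff.
int32_t getNBlocks(bool isGPU)
{
  const int32_t cpuBlocks = kPadsInSector / kPadsPerCacheline;
  return isGPU ? kRowCount : cpuBlocks;
}

int main()
{
  std::printf("GPU grid: %d blocks, CPU grid: %d blocks\n", getNBlocks(true), getNBlocks(false));
  return 0;
}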

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx

Lines changed: 76 additions & 25 deletions
@@ -9,13 +9,12 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
-/// \file GPUTPCCFCheckPadBaseline.h
+/// \file GPUTPCCFCheckPadBaseline.cxx
 /// \author Felix Weiglhofer
 
 #include "GPUTPCCFCheckPadBaseline.h"
 #include "CfArray2D.h"
 #include "PackedCharge.h"
-#include "GPUTPCGeometry.h"
 #include "clusterFinderDefs.h"
 
 #ifndef GPUCA_GPUCODE
@@ -28,51 +27,88 @@ using namespace o2::gpu::tpccf;
 template <>
 GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
 {
-  const CfFragment& fragment = clusterer.mPmemory->fragment;
-  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-
-  int32_t basePad = iBlock * PadsPerCacheline;
-  CfChargePos basePos = padToCfChargePos(basePad, clusterer);
+#ifdef GPUCA_GPUCODE
+  CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
+#else
+  CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
+#endif
+}
 
-  if (not basePos.valid()) {
+// Charges are stored in a 2D array (pad and time) using a tiling layout.
+// Tiles are 8 pads x 4 timebins large, stored in time-major layout, and make up a single cacheline.
+//
+// This kernel processes one row per block. Threads cooperatively load chunks
+// of 4 consecutive time bins for all pads into shared memory. Thread `i` then processes charges for pad `i` in shared memory.
+// Blocks require `nextMultipleOf<64>(138 * 4) = 576` threads to process the largest TPC rows with 138 pads correctly.
+GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
+{
+#ifdef GPUCA_GPUCODE
+  if (iBlock >= GPUCA_ROW_COUNT) {
     return;
   }
 
-#ifdef GPUCA_GPUCODE
-  static_assert(TPC_MAX_FRAGMENT_LEN_GPU % NumOfCachedTimebins == 0);
+  const CfFragment& fragment = clusterer.mPmemory->fragment;
+  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+
+  const auto iRow = iBlock;
+  const auto rowinfo = GetRowInfo(iRow);
+  const CfChargePos basePos{(Row)iRow, 0, 0};
 
   int32_t totalCharges = 0;
   int32_t consecCharges = 0;
   int32_t maxConsecCharges = 0;
   Charge maxCharge = 0;
 
-  int16_t localPadId = iThread / NumOfCachedTimebins;
-  int16_t localTimeBin = iThread % NumOfCachedTimebins;
-  bool handlePad = localTimeBin == 0;
+  const int16_t iPadOffset = iThread % MaxNPadsPerRow;
+  const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
+  const int16_t iPadHandle = iThread;
+  const bool handlePad = iPadHandle < rowinfo.nPads;
+
+  const auto firstTB = fragment.firstNonOverlapTimeBin();
+  const auto lastTB = fragment.lastNonOverlapTimeBin();
+
+  for (auto t = firstTB; t < lastTB; t += NumOfCachedTBs) {
+
+    const TPCFragmentTime iTime = t + iTimeOffset;
+
+    const CfChargePos pos = basePos.delta({iPadOffset, iTime});
+
+    smem.charges[iTimeOffset][iPadOffset] = iTime < lastTB && iPadOffset < rowinfo.nPads ? chargeMap[pos].unpack() : 0;
 
-  for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
-    const CfChargePos pos = basePos.delta({localPadId, int16_t(t + localTimeBin)});
-    smem.charges[localPadId][localTimeBin] = (pos.valid()) ? chargeMap[pos].unpack() : 0;
     GPUbarrier();
+
     if (handlePad) {
-      for (int32_t i = 0; i < NumOfCachedTimebins; i++) {
-        const Charge q = smem.charges[localPadId][i];
+      for (int32_t i = 0; i < NumOfCachedTBs; i++) {
+        const Charge q = smem.charges[i][iPadHandle];
         totalCharges += (q > 0);
         consecCharges = (q > 0) ? consecCharges + 1 : 0;
         maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges);
         maxCharge = CAMath::Max<Charge>(q, maxCharge);
       }
     }
+
     GPUbarrier();
   }
 
-  GPUbarrier();
-
   if (handlePad) {
-    updatePadBaseline(basePad + localPadId, clusterer, totalCharges, maxConsecCharges, maxCharge);
+    updatePadBaseline(rowinfo.globalPadOffset + iPadOffset, clusterer, totalCharges, maxConsecCharges, maxCharge);
   }
+#endif
+}
 
-#else // CPU CODE
+GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
+{
+#ifndef GPUCA_GPUCODE
+  const CfFragment& fragment = clusterer.mPmemory->fragment;
+  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+
+  int32_t basePad = iBlock * PadsPerCacheline;
+  int32_t padsPerRow;
+  CfChargePos basePos = padToCfChargePos<PadsPerCacheline>(basePad, clusterer, padsPerRow);
+
+  if (not basePos.valid()) {
+    return;
+  }
 
   constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
 
@@ -122,25 +158,40 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
 #endif
 }
 
-GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer)
+template <int32_t PadsPerBlock>
+GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer, int32_t& padsPerRow)
 {
   constexpr GPUTPCGeometry geo;
 
   int32_t padOffset = 0;
   for (Row r = 0; r < GPUCA_ROW_COUNT; r++) {
     int32_t npads = geo.NPads(r);
     int32_t padInRow = pad - padOffset;
-    if (0 <= padInRow && padInRow < CAMath::nextMultipleOf<PadsPerCacheline, int32_t>(npads)) {
-      int32_t cachelineOffset = padInRow % PadsPerCacheline;
+    if (0 <= padInRow && padInRow < npads) {
+      int32_t cachelineOffset = padInRow % PadsPerBlock;
       pad -= cachelineOffset;
+      padsPerRow = npads;
      return CfChargePos{r, Pad(padInRow - cachelineOffset), 0};
     }
     padOffset += npads;
   }
 
+  padsPerRow = 0;
   return CfChargePos{0, 0, INVALID_TIME_BIN};
 }
 
+GPUd() GPUTPCCFCheckPadBaseline::RowInfo GPUTPCCFCheckPadBaseline::GetRowInfo(int16_t row)
+{
+  constexpr GPUTPCGeometry geo;
+
+  int16_t padOffset = 0;
+  for (int16_t r = 0; r < row; r++) {
+    padOffset += geo.NPads(r);
+  }
+
+  return RowInfo{padOffset, geo.NPads(row)};
+}
+
 GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTPCClusterFinder& clusterer, int32_t totalCharges, int32_t consecCharges, Charge maxCharge)
 {
   const CfFragment& fragment = clusterer.mPmemory->fragment;
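The new GPU path maps the 576 threads of a block onto a 4 x 144 shared-memory tile: iPadOffset = iThread % MaxNPadsPerRow selects the pad column, iTimeOffset = iThread / MaxNPadsPerRow selects the timebin slot, and thread i later consumes pad i. The following host-side sketch re-derives those bounds with the concrete numbers implied by the diff (576 threads, MaxNPadsPerRow = nextMultipleOf<8>(138) = 144, NumOfCachedTBs = 4); it is illustrative code, not part of the O2 sources:

#include <cassert>
#include <cstdint>

// Concrete values implied by the diff: 576-thread blocks, a shared-memory tile
// of NumOfCachedTBs x MaxNPadsPerRow charges, and a widest row of 138 pads.
constexpr int32_t kThreadsPerBlock = 576;
constexpr int32_t kMaxNPadsPerRow = 144; // nextMultipleOf<8>(138)
constexpr int32_t kNumOfCachedTBs = 4;
constexpr int32_t kWidestRowPads = 138;

int main()
{
  for (int32_t iThread = 0; iThread < kThreadsPerBlock; ++iThread) {
    const int32_t iPadOffset = iThread % kMaxNPadsPerRow;  // pad loaded by this thread
    const int32_t iTimeOffset = iThread / kMaxNPadsPerRow; // timebin slot within the 4-bin chunk
    // Every thread writes inside charges[kNumOfCachedTBs][kMaxNPadsPerRow].
    assert(iTimeOffset < kNumOfCachedTBs && iPadOffset < kMaxNPadsPerRow);
  }
  // Threads 0..nPads-1 consume one pad each, so the widest row is fully covered.
  static_assert(kWidestRowPads <= kThreadsPerBlock, "one handling thread per pad");
  return 0;
}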

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h

Lines changed: 35 additions & 5 deletions
@@ -11,14 +11,20 @@
 
 /// \file GPUTPCCFCheckPadBaseline.h
 /// \author Felix Weiglhofer
+///
+/// Kernel identifies noisy TPC pads by analyzing charge patterns over time.
+/// A pad is marked noisy if it exceeds thresholds for total or consecutive
+/// time bins with charge, unless the charge exceeds a saturation threshold.
 
 #ifndef O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H
 #define O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H
 
 #include "GPUGeneralKernels.h"
 #include "GPUConstantMem.h"
+#include "GPUTPCGeometry.h"
 
 #include "clusterFinderDefs.h"
+#include "CfArray2D.h"
 
 namespace o2::gpu
 {
@@ -28,13 +34,20 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
 
  public:
  enum {
-    PadsPerCacheline = 8,
-    TimebinsPerCacheline = 4,
-    NumOfCachedTimebins = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / PadsPerCacheline,
+    PadsPerCacheline = TPCMapMemoryLayout<uint16_t>::Width,
+    TimebinsPerCacheline = TPCMapMemoryLayout<uint16_t>::Height,
+    EntriesPerCacheline = PadsPerCacheline * TimebinsPerCacheline,
+    NumOfCachedPads = GPUCA_WARP_SIZE / TimebinsPerCacheline,
+    NumCLsPerWarp = GPUCA_WARP_SIZE / EntriesPerCacheline,
+    NumOfCachedTBs = TimebinsPerCacheline,
+    // Threads index shared memory as [iThread / MaxNPadsPerRow][iThread % MaxNPadsPerRow].
+    // Rounding up to a multiple of PadsPerCacheline ensures iThread / MaxNPadsPerRow < NumOfCachedTBs
+    // for all threads, avoiding out-of-bounds access.
+    MaxNPadsPerRow = CAMath::nextMultipleOf<PadsPerCacheline>(GPUTPCGeometry::MaxNPadsPerRow()),
   };
 
   struct GPUSharedMemory {
-    tpccf::Charge charges[PadsPerCacheline][NumOfCachedTimebins];
+    tpccf::Charge charges[NumOfCachedTBs][MaxNPadsPerRow];
   };
 
   typedef GPUTPCClusterFinder processorType;
@@ -48,11 +61,28 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
     return gpudatatypes::RecoStep::TPCClusterFinding;
   }
 
+  static int32_t GetNBlocks(bool isGPU)
+  {
+    const int32_t nBlocks = TPC_PADS_IN_SECTOR / PadsPerCacheline;
+    return isGPU ? GPUCA_ROW_COUNT : nBlocks;
+  }
+
   template <int32_t iKernel = defaultKernel>
   GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer);
 
  private:
-  GPUd() static CfChargePos padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder&);
+  GPUd() static void CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer);
+  GPUd() static void CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer);
+
+  template <int32_t PadsPerBlock>
+  GPUd() static CfChargePos padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder&, int32_t& padsPerRow);
+
+  struct RowInfo {
+    int16_t globalPadOffset;
+    int16_t nPads;
+  };
+  GPUd() static RowInfo GetRowInfo(int16_t row);
+
   GPUd() static void updatePadBaseline(int32_t pad, const GPUTPCClusterFinder&, int32_t totalCharges, int32_t consecCharges, tpccf::Charge maxCharge);
 };
 
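With the tile geometry stated in the kernel comment (8 pads x 4 timebins per cacheline) and a 64-wide warp, as the nextMultipleOf<64> in that comment suggests, the new enum values work out as sketched below; the TPCMapMemoryLayout and GPUCA_WARP_SIZE numbers are assumptions spelled out in the comments, since both are platform dependent:

#include <cstdint>

// Illustrative re-computation of the enum; 8x4 tiles are stated in the kernel
// comment, the 64-wide warp is an assumption (GPUCA_WARP_SIZE is platform dependent).
constexpr int32_t PadsPerCacheline = 8;     // assumed TPCMapMemoryLayout<uint16_t>::Width
constexpr int32_t TimebinsPerCacheline = 4; // assumed TPCMapMemoryLayout<uint16_t>::Height
constexpr int32_t WarpSize = 64;            // assumed GPUCA_WARP_SIZE
constexpr int32_t MaxRowPads = 138;         // GPUTPCGeometry::MaxNPadsPerRow()

constexpr int32_t nextMultipleOf(int32_t x, int32_t n) { return ((x + n - 1) / n) * n; }

constexpr int32_t EntriesPerCacheline = PadsPerCacheline * TimebinsPerCacheline; // 32
constexpr int32_t NumOfCachedPads = WarpSize / TimebinsPerCacheline;             // 16
constexpr int32_t NumCLsPerWarp = WarpSize / EntriesPerCacheline;                // 2
constexpr int32_t NumOfCachedTBs = TimebinsPerCacheline;                         // 4
constexpr int32_t MaxNPadsPerRow = nextMultipleOf(MaxRowPads, PadsPerCacheline); // 144

// GPUSharedMemory::charges is NumOfCachedTBs x MaxNPadsPerRow = 4 x 144 entries.
// With 576 threads, the last thread indexes [575 / 144][575 % 144] = [3][143], in bounds;
// without the rounding, [575 / 138] = [4] would overflow the first dimension.
static_assert(MaxNPadsPerRow == 144 && NumOfCachedTBs == 4, "4 x 144 shared-memory tile");
static_assert((576 - 1) / MaxNPadsPerRow < NumOfCachedTBs, "rounded row width keeps the last thread in bounds");
static_assert((576 - 1) / MaxRowPads >= NumOfCachedTBs, "unrounded row width would overflow the time index");

int main() { return 0; }

This is also why MaxNPadsPerRow moved into GPUTPCGeometry: the header needs the widest-row constant at compile time to size the shared-memory array.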
