Skip to content

Commit db68a6e

Browse files
committed
ITS: GPU: more memory clearing in processNeighbours
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent 64fe415 commit db68a6e

File tree

1 file changed

+58
-26
lines changed

1 file changed

+58
-26
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 58 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
// O2 track model
4040
#include "ReconstructionDataFormats/Track.h"
4141
#include "DetectorsBase/Propagator.h"
42+
#include "utils/strtag.h"
4243
using namespace o2::track;
4344

4445
namespace o2::its
@@ -1106,11 +1107,19 @@ void processNeighboursHandler(const int startLayer,
11061107
const int nBlocks,
11071108
const int nThreads)
11081109
{
1110+
constexpr uint64_t Tag = qStr2Tag("ITS_PNH1");
1111+
1112+
// allocators used
11091113
auto allocInt = gpu::TypedAllocator<int>(alloc);
11101114
auto allocCellSeed = gpu::TypedAllocator<CellSeed<nLayers>>(alloc);
1111-
thrust::device_vector<int, gpu::TypedAllocator<int>> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt);
1112-
auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator<char>(alloc)).on(gpu::Stream::DefaultStream);
1115+
// use sync_policy, this part cannot be run async but tell thrust to use the allocator
1116+
auto sync_policy = THRUST_NAMESPACE::par(gpu::TypedAllocator<char>(alloc));
1117+
1118+
// put initial computation on Tag1
1119+
alloc->pushTagOnStack(Tag);
11131120

1121+
// start processing of cells
1122+
thrust::device_vector<int, gpu::TypedAllocator<int>> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt);
11141123
gpu::processNeighboursKernel<true, nLayers><<<nBlocks, nThreads>>>(
11151124
startLayer,
11161125
startLevel,
@@ -1129,10 +1138,10 @@ void processNeighboursHandler(const int startLayer,
11291138
maxChi2ClusterAttachment,
11301139
propagator,
11311140
matCorrType);
1132-
thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin());
1133-
1134-
thrust::device_vector<int, gpu::TypedAllocator<int>> updatedCellId(foundSeedsTable.back(), 0, allocInt);
1135-
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> updatedCellSeed(foundSeedsTable.back(), allocCellSeed);
1141+
thrust::exclusive_scan(sync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin());
1142+
auto foundSeeds{foundSeedsTable.back()};
1143+
thrust::device_vector<int, gpu::TypedAllocator<int>> updatedCellId(foundSeeds, 0, allocInt);
1144+
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> updatedCellSeed(foundSeeds, allocCellSeed);
11361145
gpu::processNeighboursKernel<false, nLayers><<<nBlocks, nThreads>>>(
11371146
startLayer,
11381147
startLevel,
@@ -1151,20 +1160,41 @@ void processNeighboursHandler(const int startLayer,
11511160
maxChi2ClusterAttachment,
11521161
propagator,
11531162
matCorrType);
1154-
GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream));
11551163

1164+
// now do inward steps until stop is reached
11561165
int level = startLevel;
1157-
thrust::device_vector<int, gpu::TypedAllocator<int>> lastCellId(allocInt);
1158-
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> lastCellSeed(allocCellSeed);
1166+
1167+
// Host buffers to break dependency
1168+
// FIXME: these should be on our memory resource!
1169+
std::vector<int> hostCellId;
1170+
std::vector<CellSeed<nLayers>> hostCellSeed;
1171+
1172+
// inward loop
11591173
for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
1160-
lastCellSeed.swap(updatedCellSeed);
1161-
lastCellId.swap(updatedCellId);
1174+
// copy current results to host
1175+
hostCellId.resize(updatedCellId.size());
1176+
hostCellSeed.resize(updatedCellSeed.size());
1177+
thrust::copy(updatedCellId.begin(), updatedCellId.end(), hostCellId.begin());
1178+
thrust::copy(updatedCellSeed.begin(), updatedCellSeed.end(), hostCellSeed.begin());
1179+
1180+
auto lastCellSeedSize{hostCellSeed.size()};
1181+
// but before we clear the memory, and immediately start a new block
1182+
alloc->popTagOffStack(Tag);
1183+
alloc->pushTagOnStack(Tag);
1184+
1185+
// based on the previous step's result create new LUT and zero it
1186+
thrust::device_vector<int, gpu::TypedAllocator<int>>(allocInt).swap(foundSeedsTable);
1187+
foundSeedsTable.resize(lastCellSeedSize + 1);
1188+
thrust::fill(sync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), 0);
1189+
1190+
// recreate lastCell vectors from host
1191+
thrust::device_vector<int, gpu::TypedAllocator<int>> lastCellId(hostCellId.begin(), hostCellId.end(), allocInt);
1192+
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> lastCellSeed(hostCellSeed.begin(), hostCellSeed.end(), allocCellSeed);
1193+
// also create new vectors on new block
11621194
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>>(allocCellSeed).swap(updatedCellSeed);
11631195
thrust::device_vector<int, gpu::TypedAllocator<int>>(allocInt).swap(updatedCellId);
1164-
auto lastCellSeedSize{lastCellSeed.size()};
1165-
foundSeedsTable.resize(lastCellSeedSize + 1);
1166-
thrust::fill(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), 0);
11671196

1197+
// start step
11681198
gpu::processNeighboursKernel<true, nLayers><<<nBlocks, nThreads>>>(
11691199
iLayer,
11701200
--level,
@@ -1183,14 +1213,13 @@ void processNeighboursHandler(const int startLayer,
11831213
maxChi2ClusterAttachment,
11841214
propagator,
11851215
matCorrType);
1186-
thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin());
1187-
1188-
auto foundSeeds{foundSeedsTable.back()};
1216+
// how many new seeds where found
1217+
thrust::exclusive_scan(sync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin());
1218+
foundSeeds = foundSeedsTable.back();
1219+
// do a resize, we don't need to set the memory now since we know that all of these are written to
1220+
// Note though this does not clear the memory...
11891221
updatedCellId.resize(foundSeeds);
1190-
thrust::fill(nosync_policy, updatedCellId.begin(), updatedCellId.end(), 0);
11911222
updatedCellSeed.resize(foundSeeds);
1192-
thrust::fill(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed<nLayers>());
1193-
11941223
gpu::processNeighboursKernel<false, nLayers><<<nBlocks, nThreads>>>(
11951224
iLayer,
11961225
level,
@@ -1210,12 +1239,15 @@ void processNeighboursHandler(const int startLayer,
12101239
propagator,
12111240
matCorrType);
12121241
}
1213-
GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream));
1214-
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> outSeeds(updatedCellSeed.size(), allocCellSeed);
1215-
auto end = thrust::copy_if(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector<nLayers>(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5)));
1216-
auto s{end - outSeeds.begin()};
1217-
seedsHost.reserve(seedsHost.size() + s);
1218-
thrust::copy(outSeeds.begin(), outSeeds.begin() + s, std::back_inserter(seedsHost));
1242+
1243+
// final copy of result
1244+
const auto selector = gpu::seed_selector<nLayers>(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5));
1245+
const auto count = thrust::count_if(sync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), selector);
1246+
thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> outSeeds(count, allocCellSeed);
1247+
thrust::copy_if(sync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), selector);
1248+
seedsHost.reserve(seedsHost.size() + count);
1249+
thrust::copy(outSeeds.begin(), outSeeds.end(), std::back_inserter(seedsHost));
1250+
alloc->popTagOffStack(Tag);
12191251
}
12201252

12211253
template <int nLayers>

0 commit comments

Comments
 (0)