3939// O2 track model
4040#include " ReconstructionDataFormats/Track.h"
4141#include " DetectorsBase/Propagator.h"
42+ #include " utils/strtag.h"
4243using namespace o2 ::track;
4344
4445namespace o2 ::its
@@ -1106,11 +1107,19 @@ void processNeighboursHandler(const int startLayer,
11061107 const int nBlocks,
11071108 const int nThreads)
11081109{
1110+ constexpr uint64_t Tag = qStr2Tag (" ITS_PNH1" );
1111+
1112+ // allocators used
11091113 auto allocInt = gpu::TypedAllocator<int >(alloc);
11101114 auto allocCellSeed = gpu::TypedAllocator<CellSeed<nLayers>>(alloc);
1111- thrust::device_vector<int , gpu::TypedAllocator<int >> foundSeedsTable (nCells[startLayer] + 1 , 0 , allocInt);
1112- auto nosync_policy = THRUST_NAMESPACE::par_nosync (gpu::TypedAllocator<char >(alloc)).on (gpu::Stream::DefaultStream);
1115+ // use sync_policy, this part cannot be run async but tell thrust to use the allocator
1116+ auto sync_policy = THRUST_NAMESPACE::par (gpu::TypedAllocator<char >(alloc));
1117+
1118+ // put initial computation on Tag1
1119+ alloc->pushTagOnStack (Tag);
11131120
1121+ // start processing of cells
1122+ thrust::device_vector<int , gpu::TypedAllocator<int >> foundSeedsTable (nCells[startLayer] + 1 , 0 , allocInt);
11141123 gpu::processNeighboursKernel<true , nLayers><<<nBlocks, nThreads>>> (
11151124 startLayer,
11161125 startLevel,
@@ -1129,10 +1138,10 @@ void processNeighboursHandler(const int startLayer,
11291138 maxChi2ClusterAttachment,
11301139 propagator,
11311140 matCorrType);
1132- thrust::exclusive_scan (nosync_policy , foundSeedsTable.begin (), foundSeedsTable.end (), foundSeedsTable.begin ());
1133-
1134- thrust::device_vector<int , gpu::TypedAllocator<int >> updatedCellId (foundSeedsTable. back () , 0 , allocInt);
1135- thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> updatedCellSeed (foundSeedsTable. back () , allocCellSeed);
1141+ thrust::exclusive_scan (sync_policy , foundSeedsTable.begin (), foundSeedsTable.end (), foundSeedsTable.begin ());
1142+ auto foundSeeds{foundSeedsTable. back ()};
1143+ thrust::device_vector<int , gpu::TypedAllocator<int >> updatedCellId (foundSeeds , 0 , allocInt);
1144+ thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> updatedCellSeed (foundSeeds , allocCellSeed);
11361145 gpu::processNeighboursKernel<false , nLayers><<<nBlocks, nThreads>>> (
11371146 startLayer,
11381147 startLevel,
@@ -1151,20 +1160,41 @@ void processNeighboursHandler(const int startLayer,
11511160 maxChi2ClusterAttachment,
11521161 propagator,
11531162 matCorrType);
1154- GPUChkErrS (cudaStreamSynchronize (gpu::Stream::DefaultStream));
11551163
1164+ // now do inward steps until stop is reached
11561165 int level = startLevel;
1157- thrust::device_vector<int , gpu::TypedAllocator<int >> lastCellId (allocInt);
1158- thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> lastCellSeed (allocCellSeed);
1166+
1167+ // Host buffers to break dependency
1168+ // FIXME: these should be on our memory resource!
1169+ std::vector<int > hostCellId;
1170+ std::vector<CellSeed<nLayers>> hostCellSeed;
1171+
1172+ // inward loop
11591173 for (int iLayer{startLayer - 1 }; iLayer > 0 && level > 2 ; --iLayer) {
1160- lastCellSeed.swap (updatedCellSeed);
1161- lastCellId.swap (updatedCellId);
1174+ // copy current results to host
1175+ hostCellId.resize (updatedCellId.size ());
1176+ hostCellSeed.resize (updatedCellSeed.size ());
1177+ thrust::copy (updatedCellId.begin (), updatedCellId.end (), hostCellId.begin ());
1178+ thrust::copy (updatedCellSeed.begin (), updatedCellSeed.end (), hostCellSeed.begin ());
1179+
1180+ auto lastCellSeedSize{hostCellSeed.size ()};
1181+ // but before we clear the memory, and immediately start a new block
1182+ alloc->popTagOffStack (Tag);
1183+ alloc->pushTagOnStack (Tag);
1184+
1185+ // based on the previous step's result create new LUT and zero it
1186+ thrust::device_vector<int , gpu::TypedAllocator<int >>(allocInt).swap (foundSeedsTable);
1187+ foundSeedsTable.resize (lastCellSeedSize + 1 );
1188+ thrust::fill (sync_policy, foundSeedsTable.begin (), foundSeedsTable.end (), 0 );
1189+
1190+ // recreate lastCell vectors from host
1191+ thrust::device_vector<int , gpu::TypedAllocator<int >> lastCellId (hostCellId.begin (), hostCellId.end (), allocInt);
1192+ thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> lastCellSeed (hostCellSeed.begin (), hostCellSeed.end (), allocCellSeed);
1193+ // also create new vectors on new block
11621194 thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>>(allocCellSeed).swap (updatedCellSeed);
11631195 thrust::device_vector<int , gpu::TypedAllocator<int >>(allocInt).swap (updatedCellId);
1164- auto lastCellSeedSize{lastCellSeed.size ()};
1165- foundSeedsTable.resize (lastCellSeedSize + 1 );
1166- thrust::fill (nosync_policy, foundSeedsTable.begin (), foundSeedsTable.end (), 0 );
11671196
1197+ // start step
11681198 gpu::processNeighboursKernel<true , nLayers><<<nBlocks, nThreads>>> (
11691199 iLayer,
11701200 --level,
@@ -1183,14 +1213,13 @@ void processNeighboursHandler(const int startLayer,
11831213 maxChi2ClusterAttachment,
11841214 propagator,
11851215 matCorrType);
1186- thrust::exclusive_scan (nosync_policy, foundSeedsTable.begin (), foundSeedsTable.end (), foundSeedsTable.begin ());
1187-
1188- auto foundSeeds{foundSeedsTable.back ()};
1216+ // how many new seeds where found
1217+ thrust::exclusive_scan (sync_policy, foundSeedsTable.begin (), foundSeedsTable.end (), foundSeedsTable.begin ());
1218+ foundSeeds = foundSeedsTable.back ();
1219+ // do a resize, we don't need to set the memory now since we know that all of these are written to
1220+ // Note though this does not clear the memory...
11891221 updatedCellId.resize (foundSeeds);
1190- thrust::fill (nosync_policy, updatedCellId.begin (), updatedCellId.end (), 0 );
11911222 updatedCellSeed.resize (foundSeeds);
1192- thrust::fill (nosync_policy, updatedCellSeed.begin (), updatedCellSeed.end (), CellSeed<nLayers>());
1193-
11941223 gpu::processNeighboursKernel<false , nLayers><<<nBlocks, nThreads>>> (
11951224 iLayer,
11961225 level,
@@ -1210,12 +1239,15 @@ void processNeighboursHandler(const int startLayer,
12101239 propagator,
12111240 matCorrType);
12121241 }
1213- GPUChkErrS (cudaStreamSynchronize (gpu::Stream::DefaultStream));
1214- thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> outSeeds (updatedCellSeed.size (), allocCellSeed);
1215- auto end = thrust::copy_if (nosync_policy, updatedCellSeed.begin (), updatedCellSeed.end (), outSeeds.begin (), gpu::seed_selector<nLayers>(1 .e3 , maxChi2NDF * ((startLevel + 2 ) * 2 - 5 )));
1216- auto s{end - outSeeds.begin ()};
1217- seedsHost.reserve (seedsHost.size () + s);
1218- thrust::copy (outSeeds.begin (), outSeeds.begin () + s, std::back_inserter (seedsHost));
1242+
1243+ // final copy of result
1244+ const auto selector = gpu::seed_selector<nLayers>(1 .e3 , maxChi2NDF * ((startLevel + 2 ) * 2 - 5 ));
1245+ const auto count = thrust::count_if (sync_policy, updatedCellSeed.begin (), updatedCellSeed.end (), selector);
1246+ thrust::device_vector<CellSeed<nLayers>, gpu::TypedAllocator<CellSeed<nLayers>>> outSeeds (count, allocCellSeed);
1247+ thrust::copy_if (sync_policy, updatedCellSeed.begin (), updatedCellSeed.end (), outSeeds.begin (), selector);
1248+ seedsHost.reserve (seedsHost.size () + count);
1249+ thrust::copy (outSeeds.begin (), outSeeds.end (), std::back_inserter (seedsHost));
1250+ alloc->popTagOffStack (Tag);
12191251}
12201252
12211253template <int nLayers>
0 commit comments