@@ -267,10 +267,10 @@ GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t
267267 }
268268
269269 bool hasHIPTrigger = false ;
270-
271270 if (hipFilterOn) {
272271 hasHIPTrigger = work_group_any (thisThreadHasTrigger);
273272 } else {
273+ // Need a barrier here even if HIP filter is disabled
274274 GPUbarrier ();
275275 }
276276
@@ -458,6 +458,21 @@ GPUd() void GPUTPCCFHIPTailConnector::Thread<0>(int32_t nBlocks, int32_t nThread
458458 // HIP TAILS: indexing starts at 1, so 0 index indicates no connection
459459 HIPTailDescriptor* tails = GetHIPTails (clusterer, row);
460460
461+ #ifdef GPUCA_DETERMINISTIC_MODE
462+ // Races in the tail comparisons and the atomic swap can lead to slightly different clusters,
463+ // so deterministic mode needs a sequential fallback.
464+ if (iThread > 0 ) {
465+ return ;
466+ }
467+ nThreads = 1 ;
468+ GPUCommonAlgorithm::sortInBlock (tails + 1 , tails + nTails + 1 , [](auto &&t1, auto &&t2) {
469+ if (t1.pad != t2.pad ) {
470+ return t1.pad < t2.pad ;
471+ }
472+ return t1.tailStart < t2.tailStart ;
473+ });
474+ #endif
475+
461476 for (uint32_t iTail = iThread + 1 ; iTail <= nTails; iTail += nThreads) {
462477 auto * tail = &tails[iTail];
463478
@@ -543,7 +558,7 @@ GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
543558 float padSigma = CAMath::Sqrt (CAMath::Max (0 .f , padSqSum / weightSum - padMean * padMean));
544559 float timeSigma = CAMath::Sqrt (CAMath::Max (0 .f , timeSqSum / weightSum - timeMean * timeMean));
545560
546- o2:: tpc::ClusterNative cn;
561+ tpc::ClusterNative cn;
547562 cn.qMax = qMax;
548563 cn.qTot = (uint16_t )CAMath::Min (qTot, 65535 .f );
549564 float clusterTime = fragment.start + timeMean - clusterer.Param ().rec .tpc .clustersShiftTimebinsClusterizer ;
0 commit comments