diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 9a337c02ad26d..33dd5b9e6bb89 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -423,6 +424,17 @@ class GPUReconstruction void debugInit(); void debugExit(); + struct debugWriter { + debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents); + void header(); + void row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport = ""); + + private: + std::ofstream streamCSV; + bool mMarkdown; + uint32_t mStatNEvents; + }; + static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg); }; diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 409c28b8bf328..752b5f27ded3f 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -35,6 +35,7 @@ #include #include +#include #ifndef _WIN32 #include @@ -230,6 +231,7 @@ int32_t GPUReconstructionCPU::RunChains() GPUInfo("Allocated memory when starting processing %34s", ""); PrintMemoryOverview(); } + mTimerTotal.Start(); const std::clock_t cpuTimerStart = std::clock(); int32_t retVal = 0; @@ -264,7 +266,10 @@ int32_t GPUReconstructionCPU::RunChains() double kernelTotal = 0; std::vector kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.); + debugWriter writer(GetProcessingSettings().debugCSV, GetProcessingSettings().debugMarkdown, mStatNEvents); + if (GetProcessingSettings().debugLevel >= 1) { + writer.header(); for (uint32_t i = 0; i < mTimers.size(); i++) { double time = 0; if (mTimers[i] == nullptr) { @@ -284,11 +289,7 @@ int32_t GPUReconstructionCPU::RunChains() int32_t stepNum = getRecoStepNum(mTimers[i]->step); kernelStepTimes[stepNum] += time; } - char bandwidth[256] = ""; - if (mTimers[i]->memSize && mStatNEvents && time != 0.) { - snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count); - } - printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth); + writer.row('K', mTimers[i]->count, mTimers[i]->name.c_str(), time, -1.0, -1.0, mTimers[i]->memSize); if (GetProcessingSettings().resetTimers) { mTimers[i]->count = 0; mTimers[i]->memSize = 0; @@ -298,16 +299,13 @@ int32_t GPUReconstructionCPU::RunChains() if (GetProcessingSettings().recoTaskTiming) { for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) { if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) { - printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks", - gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime()); + writer.row(' ', 0, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)", kernelStepTimes[i], mTimersRecoSteps[i].timerCPU, mTimersRecoSteps[i].timerTotal.GetElapsedTime(), 0); } if (mTimersRecoSteps[i].bytesToGPU) { - printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents, - mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU); + writer.row('D', mTimersRecoSteps[i].countToGPU, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)", mTimersRecoSteps[i].timerToGPU.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToGPU); } if (mTimersRecoSteps[i].bytesToHost) { - printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents, - mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost); + writer.row('D', mTimersRecoSteps[i].countToHost, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)", mTimersRecoSteps[i].timerToHost.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToHost); } if (GetProcessingSettings().resetTimers) { mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0; @@ -321,14 +319,11 @@ int32_t GPUReconstructionCPU::RunChains() } for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) { if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) { - printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents); + writer.row(' ', 0, gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime(), -1.0, -1.0, 0); } } - if (GetProcessingSettings().debugLevel >= 1) { - mStatKernelTime = kernelTotal * 1000000 / mStatNEvents; - printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str()); - } - printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str()); + double gpu_time = GetProcessingSettings().debugLevel >= 1 ? kernelTotal : -1.0; + writer.row(' ', 0, "Wall", gpu_time, mStatCPUTime, mTimerTotal.GetElapsedTime(), 0, nEventReport); } else if (GetProcessingSettings().debugLevel >= 0) { GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str()); } diff --git a/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx b/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx index c1c31eedde1b2..3ae30dfe50f91 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx @@ -23,6 +23,8 @@ #include #include #include +#include +#include using namespace o2::gpu; @@ -186,3 +188,107 @@ bool GPUReconstruction::triggerDebugDump() } return false; } + +GPUReconstruction::debugWriter::debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents) : mMarkdown{markdown}, mStatNEvents{statNEvents} +{ + if (!filenameCSV.empty()) { + streamCSV.open(filenameCSV, std::ios::out | std::ios::app); + } +} + +void GPUReconstruction::debugWriter::header() +{ + if (streamCSV.is_open() && !streamCSV.tellp()) { + streamCSV << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; + } + + if (mMarkdown) { + std::cout << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; + std::cout << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; + } +} + +void GPUReconstruction::debugWriter::row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport) +{ + double scale = 1000000.0 / mStatNEvents; + + if (streamCSV.is_open()) { + streamCSV << type << ","; + if (count != 0) + streamCSV << count; + streamCSV << "," << name << ","; + if (gpu_time != -1.0) + streamCSV << std::format("{:.0f}", gpu_time * scale); + streamCSV << ","; + if (cpu_time != -1.0) + streamCSV << std::format("{:.0f}", cpu_time * scale); + streamCSV << ","; + if (cpu_time != -1.0 && total_time != -1.0) + streamCSV << std::format("{:.2f}", cpu_time / total_time); + streamCSV << ","; + if (total_time != -1.0) + streamCSV << std::format("{:.0f}", total_time * scale); + streamCSV << ","; + if (memSize != 0 && count != 0) + streamCSV << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + else + streamCSV << ",,"; + streamCSV << std::endl; + } + + if (mMarkdown) { + std::cout << "| " << type << " | "; + if (count != 0) + std::cout << std::format("{:6} |", count); + else + std::cout << " |"; + std::cout << std::format(" {:42}|", name); + if (gpu_time != -1.0) + std::cout << std::format("{:10.0f} |", gpu_time * scale); + else + std::cout << " |"; + if (cpu_time != -1.0) + std::cout << std::format("{:10.0f} |", cpu_time * scale); + else + std::cout << " |"; + if (cpu_time != -1.0 && total_time != -1.0) + std::cout << std::format("{:8.2f} |", cpu_time / total_time); + else + std::cout << " |"; + if (total_time != -1.0) + std::cout << std::format("{:10.0f} |", total_time * scale); + else + std::cout << " |"; + if (memSize != 0 && count != 0) + std::cout << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + else + std::cout << " | | |"; + std::cout << std::endl; + } else { + if (name.substr(0, 3) == "GPU") { + char bandwidth[256] = ""; + if (memSize && mStatNEvents && gpu_time != 0.0) { + snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + } + printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type, count, name.c_str(), gpu_time * scale, bandwidth); + } else if (name.substr(0, 3) == "TPC") { + std::size_t n = name.find('('); + std::string basename = name.substr(0, n - 1); + std::string postfix = name.substr(n + 1, name.size() - n - 2); + if (total_time != -1.0) { + printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", postfix.c_str(), + basename.c_str(), gpu_time * scale, "", total_time * scale, cpu_time * scale, cpu_time / total_time); + } else { + printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", count, postfix.c_str(), basename.c_str(), gpu_time * scale, + memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + } + } else if (name == "Prepare") { + printf("Execution Time: General Step : %50s Time: %'10.0f us\n", name.c_str(), gpu_time * scale); + } else if (name == "Wall") { + if (gpu_time != -1.0) { + printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", gpu_time * scale, nEventReport.c_str()); + } + printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", total_time * scale, cpu_time * scale, cpu_time / total_time, nEventReport.c_str()); + } + } +} diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 57cb1371a4aa0..606deb44d9528 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -329,6 +329,8 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)") AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file") AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6") +AddOption(debugCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.") +AddOption(debugMarkdown, bool, false, "", 0, "Print the results of standlaone benchmarks in markdown format") AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures") AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks") AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1)) diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index a2e74c45fcb86..2a2f7adea8cb9 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -627,7 +627,9 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU if (configStandalone.runs > 1) { printf("Run %d (thread %d)\n", iteration + 1, threadId); } - recUse->SetResetTimers(iRun < configStandalone.runsInit); + if (configStandalone.runsInit > 0 && configStandalone.proc.debugCSV.empty()) { + recUse->SetResetTimers(iRun < configStandalone.runsInit); + } if (configStandalone.outputcontrolmem) { recUse->SetOutputControl(threadId ? outputmemoryPipeline.get() : outputmemory.get(), configStandalone.outputcontrolmem); } @@ -685,7 +687,9 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0; } chainTrackingAsync->mIOPtrs.clustersNative = nullptr; - recAsync->SetResetTimers(iRun < configStandalone.runsInit); + if (configStandalone.runsInit > 0 && configStandalone.proc.debugCSV.empty()) { + recAsync->SetResetTimers(iRun < configStandalone.runsInit); + } tmpRetVal = recAsync->RunChains(); if (tmpRetVal == 0 || tmpRetVal == 2) { OutputStat(chainTrackingAsync, nullptr, nullptr);