From 5cff2517b11780bc278b56ba68f780ac67dc6de5 Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Sun, 26 Oct 2025 20:48:54 +0100 Subject: [PATCH 1/4] Improve profiling information --- .../TilingPrototypes.py | 71 +++++++++++++++---- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py index 2f6c1e9590..d0e93c612e 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py @@ -79,14 +79,27 @@ class ProfilingPrototypeMixIn(ABC): _printLoopSetup = NodeTemplate(""" StopTimer(); + printf("===== Profiling ${nodeName} =====\\n"); for (int ${profileIdxVar} = ((*${tileIdxPtr} > 0) ? ${numTiles}[(*${tileIdxPtr} - 1)] : 0); ${profileIdxVar} < ${numTiles}[*${tileIdxPtr}]; ${profileIdxVar}++){ """) - _printCycleDifference = NodeTemplate(r""" - printf("%s%u] %s%u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \ - ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + _measurementDeclaration = NodeTemplate(""" + uint32_t ${measurement} = ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}]; + """) + + _printCycleDifference = NodeTemplate(""" + printf("%s%u] %s%6u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \ + ${measurement}, ${suffixStr}); + """) + + _printCycleContribution = NodeTemplate(""" + uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput}; + uint32_t dma = ${measurementInput} + ${measurementOutput}; + float dma_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; + printf("%s%u] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhad, %u + %u)\\n", ${prefixStr}, ${profileIdxVar}, total, kernel_percentage, dma_percentage, ${measurementKernel}, dma); """) _printLoopTeardown = NodeTemplate(""" @@ -151,13 +164,37 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe "tileIdxPtr": tileIdxPtr, }) + executionBlock.addRight( + cls._measurementDeclaration, { + "measurement": f"{nodeName}_ingress_dma_wait_measurement", + "measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements", + "measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements", + "profileIdxVar": profileIdxVar, + }) + + if metaInfo.kernelLevelTiling: + executionBlock.addRight( + cls._measurementDeclaration, { + "measurement": f"{nodeName}_kernel_measurement", + "measurementsStart": f"{nodeName}_kernel_start_measurements", + "measurementsEnd": f"{nodeName}_kernel_end_measurements", + "profileIdxVar": profileIdxVar, + }) + + executionBlock.addRight( + cls._measurementDeclaration, { + "measurement": f"{nodeName}_egress_dma_wait_measurement", + "measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements", + "measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements", + "profileIdxVar": profileIdxVar, + }) + executionBlock.addRight( cls._printCycleDifference, { "prefixStr": f"{nodeName}_prefix", "suffixStr": f"{nodeName}_suffix", - "flavorStr": "Input DMA took ", - "measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements", - "measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements", + "flavorStr": "Pre-Kernel :", + "measurement": f"{nodeName}_ingress_dma_wait_measurement", "profileIdxVar": profileIdxVar, }) @@ -166,9 +203,8 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe cls._printCycleDifference, { "prefixStr": f"{nodeName}_prefix", "suffixStr": f"{nodeName}_suffix", - "flavorStr": "Kernel took ", - "measurementsStart": f"{nodeName}_kernel_start_measurements", - "measurementsEnd": f"{nodeName}_kernel_end_measurements", + "flavorStr": "Kernel :", + "measurement": f"{nodeName}_kernel_measurement", "profileIdxVar": profileIdxVar, }) @@ -176,12 +212,23 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe cls._printCycleDifference, { "prefixStr": f"{nodeName}_prefix", "suffixStr": f"{nodeName}_suffix", - "flavorStr": "Output DMA took ", - "measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements", - "measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements", + "flavorStr": "Post-Kernel:", + "measurement": f"{nodeName}_egress_dma_wait_measurement", "profileIdxVar": profileIdxVar, }) + # Total Time: Input + Kernel + Output + # Overhead: (Input + Output) / Total + if metaInfo.kernelLevelTiling: + executionBlock.addRight( + cls._printCycleContribution, { + "prefixStr": f"{nodeName}_prefix", + "measurementInput": f"{nodeName}_ingress_dma_wait_measurement", + "measurementKernel": f"{nodeName}_kernel_measurement", + "measurementOutput": f"{nodeName}_egress_dma_wait_measurement", + "profileIdxVar": profileIdxVar, + }) + executionBlock.addRight(cls._printLoopTeardown, {}) return executionBlock From 20c47a808565325f0610bf30f8f7bc6248ce1f68 Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Fri, 12 Dec 2025 16:21:57 +0100 Subject: [PATCH 2/4] Profile all memory levels --- .../DoubleBufferingTilingCodeGeneration.py | 18 +++++++----------- .../SingleBufferingTilingCodeGeneration.py | 18 +++++++----------- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py index d436d1ccc2..ad9c6ad012 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py @@ -276,17 +276,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, teardownStatements.append(CodeSnippet(self._lineComment, {"comment": "Deinitialize DMA future"})) teardownStatements.extend(f.deinit() for f in ingressFutures | egressFutures) - metaInfo = TilingMetaInfo( - nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", - nodeOps = operatorRepresentation['nodeOps'], - numTiles = operatorRepresentation['numTiles'], - totalNumTiles = len(tilingSchedule.outputLoadSchedule), - tileIdxPtr = operatorRepresentation['tileIdxPtr'], - tileIdxVar = "TILING_I", - # TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel. - # The current implementation does this by checking whether we are at the lowest memory level, - # which is hardcoded by the value "L1". Change this to be memory level agnostic. - kernelLevelTiling = self.localMemory == "L1") + metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", + nodeOps = operatorRepresentation['nodeOps'], + numTiles = operatorRepresentation['numTiles'], + totalNumTiles = len(tilingSchedule.outputLoadSchedule), + tileIdxPtr = operatorRepresentation['tileIdxPtr'], + tileIdxVar = "TILING_I", + kernelLevelTiling = True) executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements, openLoopStatements, closeLoopStatements, setupStatements, diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py index fb08a0e818..ea1e938b58 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py @@ -117,17 +117,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, closeLoopStatements = [CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})] - metaInfo = TilingMetaInfo( - nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", - nodeOps = operatorRepresentation['nodeOps'], - numTiles = operatorRepresentation['numTiles'], - totalNumTiles = len(tilingSchedule.outputLoadSchedule), - tileIdxPtr = operatorRepresentation['tileIdxPtr'], - tileIdxVar = "TILING_I", - # TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel. - # The current implementation does this by checking whether we are at the lowest memory level, - # which is hardcoded by the value "L1". Change this to be memory level agnostic. - kernelLevelTiling = self.localMemory == "L1") + metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", + nodeOps = operatorRepresentation['nodeOps'], + numTiles = operatorRepresentation['numTiles'], + totalNumTiles = len(tilingSchedule.outputLoadSchedule), + tileIdxPtr = operatorRepresentation['tileIdxPtr'], + tileIdxVar = "TILING_I", + kernelLevelTiling = True) executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements, openLoopStatements, closeLoopStatements, setupStatements, From e0e7abfbca57f41a6504dcfd3b943f20b550394c Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Fri, 12 Dec 2025 16:39:01 +0100 Subject: [PATCH 3/4] Update Changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e33026a58..4125838100 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- Improve Profiling [#138](https://github.com/pulp-platform/Deeploy/pull/138) - FP32 ReduceMean operator improvement [#137](https://github.com/pulp-platform/Deeploy/pull/137) - Support for RMSNorm (Pow and Sqrt operators) [#136](https://github.com/pulp-platform/Deeploy/pull/136) - Demo TinyViT compatibility with tiled Siracusa [#124](https://github.com/pulp-platform/Deeploy/pull/124) @@ -81,6 +82,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy` - PULPClusterEngine now accepts a `n_cores` parameter to set the number of cores used - annotateNCores method to PULPDeployer that adds an `n_cores` key to all PULPClusterEngine templates' operatorRepresentations +- Calculate non-kernel overhead and show total time spent during profiling ### Changed - Structure of Tests subdir for improved ordering @@ -123,6 +125,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Added missing shape annotation to the testTypeInferenceDifferentTypes - Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode - changed `_mapNode` to `_selectEngine` which reduces the responsibility of that function to, as the name states, just engine selection +- Print kernel profiling information for all memory levels ### Fixed - Fixed ReduceMean parallelization and tiling issues described in Issue [#134](https://github.com/pulp-platform/Deeploy/issues/134). From d73f8109080c47d1a34cf0a82a92dd508abbe121 Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Wed, 24 Dec 2025 01:41:32 +0100 Subject: [PATCH 4/4] Implement PR feedback --- .../CodeTransformationPasses/TilingPrototypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py index d0e93c612e..09a4ef56eb 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py @@ -97,9 +97,9 @@ class ProfilingPrototypeMixIn(ABC): _printCycleContribution = NodeTemplate(""" uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput}; uint32_t dma = ${measurementInput} + ${measurementOutput}; - float dma_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total; float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; - printf("%s%u] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhad, %u + %u)\\n", ${prefixStr}, ${profileIdxVar}, total, kernel_percentage, dma_percentage, ${measurementKernel}, dma); + printf("%s%u] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\\n", ${prefixStr}, ${profileIdxVar}, total, kernel_percentage, overhead_percentage , ${measurementKernel}, dma); """) _printLoopTeardown = NodeTemplate("""