diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e33026a58..4125838100 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- Improve Profiling [#138](https://github.com/pulp-platform/Deeploy/pull/138) - FP32 ReduceMean operator improvement [#137](https://github.com/pulp-platform/Deeploy/pull/137) - Support for RMSNorm (Pow and Sqrt operators) [#136](https://github.com/pulp-platform/Deeploy/pull/136) - Demo TinyViT compatibility with tiled Siracusa [#124](https://github.com/pulp-platform/Deeploy/pull/124) @@ -81,6 +82,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy` - PULPClusterEngine now accepts a `n_cores` parameter to set the number of cores used - annotateNCores method to PULPDeployer that adds an `n_cores` key to all PULPClusterEngine templates' operatorRepresentations +- Calculate non-kernel overhead and show total time spent during profiling ### Changed - Structure of Tests subdir for improved ordering @@ -123,6 +125,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Added missing shape annotation to the testTypeInferenceDifferentTypes - Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode - changed `_mapNode` to `_selectEngine` which reduces the responsibility of that function to, as the name states, just engine selection +- Print kernel profiling information for all memory levels ### Fixed - Fixed ReduceMean parallelization and tiling issues described in Issue [#134](https://github.com/pulp-platform/Deeploy/issues/134). diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py index d436d1ccc2..ad9c6ad012 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py @@ -276,17 +276,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, teardownStatements.append(CodeSnippet(self._lineComment, {"comment": "Deinitialize DMA future"})) teardownStatements.extend(f.deinit() for f in ingressFutures | egressFutures) - metaInfo = TilingMetaInfo( - nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", - nodeOps = operatorRepresentation['nodeOps'], - numTiles = operatorRepresentation['numTiles'], - totalNumTiles = len(tilingSchedule.outputLoadSchedule), - tileIdxPtr = operatorRepresentation['tileIdxPtr'], - tileIdxVar = "TILING_I", - # TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel. - # The current implementation does this by checking whether we are at the lowest memory level, - # which is hardcoded by the value "L1". Change this to be memory level agnostic. - kernelLevelTiling = self.localMemory == "L1") + metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", + nodeOps = operatorRepresentation['nodeOps'], + numTiles = operatorRepresentation['numTiles'], + totalNumTiles = len(tilingSchedule.outputLoadSchedule), + tileIdxPtr = operatorRepresentation['tileIdxPtr'], + tileIdxVar = "TILING_I", + kernelLevelTiling = True) executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements, openLoopStatements, closeLoopStatements, setupStatements, diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py index fb08a0e818..ea1e938b58 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py @@ -117,17 +117,13 @@ def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, closeLoopStatements = [CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})] - metaInfo = TilingMetaInfo( - nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", - nodeOps = operatorRepresentation['nodeOps'], - numTiles = operatorRepresentation['numTiles'], - totalNumTiles = len(tilingSchedule.outputLoadSchedule), - tileIdxPtr = operatorRepresentation['tileIdxPtr'], - tileIdxVar = "TILING_I", - # TODO: The kernelLevelTiling field is used in profiling to know we are generating code around the kernel. - # The current implementation does this by checking whether we are at the lowest memory level, - # which is hardcoded by the value "L1". Change this to be memory level agnostic. - kernelLevelTiling = self.localMemory == "L1") + metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", + nodeOps = operatorRepresentation['nodeOps'], + numTiles = operatorRepresentation['numTiles'], + totalNumTiles = len(tilingSchedule.outputLoadSchedule), + tileIdxPtr = operatorRepresentation['tileIdxPtr'], + tileIdxVar = "TILING_I", + kernelLevelTiling = True) executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements, openLoopStatements, closeLoopStatements, setupStatements, diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py index 2f6c1e9590..09a4ef56eb 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py @@ -79,14 +79,27 @@ class ProfilingPrototypeMixIn(ABC): _printLoopSetup = NodeTemplate(""" StopTimer(); + printf("===== Profiling ${nodeName} =====\\n"); for (int ${profileIdxVar} = ((*${tileIdxPtr} > 0) ? ${numTiles}[(*${tileIdxPtr} - 1)] : 0); ${profileIdxVar} < ${numTiles}[*${tileIdxPtr}]; ${profileIdxVar}++){ """) - _printCycleDifference = NodeTemplate(r""" - printf("%s%u] %s%u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \ - ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + _measurementDeclaration = NodeTemplate(""" + uint32_t ${measurement} = ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}]; + """) + + _printCycleDifference = NodeTemplate(""" + printf("%s%u] %s%6u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \ + ${measurement}, ${suffixStr}); + """) + + _printCycleContribution = NodeTemplate(""" + uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput}; + uint32_t dma = ${measurementInput} + ${measurementOutput}; + float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; + printf("%s%u] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\\n", ${prefixStr}, ${profileIdxVar}, total, kernel_percentage, overhead_percentage , ${measurementKernel}, dma); """) _printLoopTeardown = NodeTemplate(""" @@ -151,13 +164,37 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe "tileIdxPtr": tileIdxPtr, }) + executionBlock.addRight( + cls._measurementDeclaration, { + "measurement": f"{nodeName}_ingress_dma_wait_measurement", + "measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements", + "measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements", + "profileIdxVar": profileIdxVar, + }) + + if metaInfo.kernelLevelTiling: + executionBlock.addRight( + cls._measurementDeclaration, { + "measurement": f"{nodeName}_kernel_measurement", + "measurementsStart": f"{nodeName}_kernel_start_measurements", + "measurementsEnd": f"{nodeName}_kernel_end_measurements", + "profileIdxVar": profileIdxVar, + }) + + executionBlock.addRight( + cls._measurementDeclaration, { + "measurement": f"{nodeName}_egress_dma_wait_measurement", + "measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements", + "measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements", + "profileIdxVar": profileIdxVar, + }) + executionBlock.addRight( cls._printCycleDifference, { "prefixStr": f"{nodeName}_prefix", "suffixStr": f"{nodeName}_suffix", - "flavorStr": "Input DMA took ", - "measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements", - "measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements", + "flavorStr": "Pre-Kernel :", + "measurement": f"{nodeName}_ingress_dma_wait_measurement", "profileIdxVar": profileIdxVar, }) @@ -166,9 +203,8 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe cls._printCycleDifference, { "prefixStr": f"{nodeName}_prefix", "suffixStr": f"{nodeName}_suffix", - "flavorStr": "Kernel took ", - "measurementsStart": f"{nodeName}_kernel_start_measurements", - "measurementsEnd": f"{nodeName}_kernel_end_measurements", + "flavorStr": "Kernel :", + "measurement": f"{nodeName}_kernel_measurement", "profileIdxVar": profileIdxVar, }) @@ -176,12 +212,23 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe cls._printCycleDifference, { "prefixStr": f"{nodeName}_prefix", "suffixStr": f"{nodeName}_suffix", - "flavorStr": "Output DMA took ", - "measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements", - "measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements", + "flavorStr": "Post-Kernel:", + "measurement": f"{nodeName}_egress_dma_wait_measurement", "profileIdxVar": profileIdxVar, }) + # Total Time: Input + Kernel + Output + # Overhead: (Input + Output) / Total + if metaInfo.kernelLevelTiling: + executionBlock.addRight( + cls._printCycleContribution, { + "prefixStr": f"{nodeName}_prefix", + "measurementInput": f"{nodeName}_ingress_dma_wait_measurement", + "measurementKernel": f"{nodeName}_kernel_measurement", + "measurementOutput": f"{nodeName}_egress_dma_wait_measurement", + "profileIdxVar": profileIdxVar, + }) + executionBlock.addRight(cls._printLoopTeardown, {}) return executionBlock