diff --git a/.github/workflows/ci-platform-snitch-tiled.yml b/.github/workflows/ci-platform-snitch-tiled.yml index 3850ce2bde..4ebb9aba9a 100644 --- a/.github/workflows/ci-platform-snitch-tiled.yml +++ b/.github/workflows/ci-platform-snitch-tiled.yml @@ -41,6 +41,11 @@ jobs: {"name":"Kernels/Integer/Softmax/Large","L1":[5000,10000]}, {"name":"Kernels/FP32/Softmax/Regular","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/RMSNorm_fused","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/MatMul","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/Add/Regular","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/Hardswish","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/Div","L1":[2000,5000,10000]}, {"name":"Kernels/FP32/GEMM/Regular","L1":[2000,5000,10000]}, {"name":"Kernels/FP32/GEMM/TransB","L1":[2000,5000,10000]}, diff --git a/.github/workflows/ci-platform-snitch.yml b/.github/workflows/ci-platform-snitch.yml index 21f436b2a6..f3a1f8722f 100644 --- a/.github/workflows/ci-platform-snitch.yml +++ b/.github/workflows/ci-platform-snitch.yml @@ -37,6 +37,11 @@ jobs: docker-image: ${{ needs.select-env.outputs.image }} test-names: | Kernels/FP32/Softmax/Regular + Kernels/FP32/RMSNorm_fused + Kernels/FP32/MatMul + Kernels/FP32/Add/Regular + Kernels/FP32/Hardswish + Kernels/FP32/Div Kernels/Integer/Add/Large Kernels/Integer/Add/Regular diff --git a/CMakeLists.txt b/CMakeLists.txt index 70dec13084..e675a648cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,8 +19,8 @@ if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch, Snitch_tiled)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch Snitch_tiled) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -36,6 +36,8 @@ elseif(platform STREQUAL Generic) message(STATUS "Building for platform 'Generic'") elseif(platform STREQUAL Snitch) message(STATUS "Building for platform 'Snitch'") +elseif(platform STREQUAL Snitch_tiled) + message(STATUS "Building for platform 'Snitch_tiled'") elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) @@ -211,7 +213,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor endif() -if(platform STREQUAL Snitch) +if(platform STREQUAL Snitch OR platform STREQUAL Snitch_tiled) if(TOOLCHAIN STREQUAL LLVM) set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/snitch/toolchain_llvm.cmake) diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index ec2ed6270f..221a797dab 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -283,6 +283,9 @@ BasicConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, BasicTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, BasicTransformer) ] BasicQuantBindings = [ diff --git 
a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..26dd5746c9 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -709,3 +709,31 @@ def computeOps(self): numPx = opRep['dim_im_out_x'] return numPx * opsPerPx + + +class RMSNormLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # RMSNorm: square, mean, sqrt, div, mul + size = self.mapper.parser.operatorRepresentation['size'] + lastDimLength = self.mapper.parser.operatorRepresentation['lastDimLength'] + batch_size = size // lastDimLength + + # square + sum + mean + eps + sqrt + div + mul + ops = size + batch_size * lastDimLength + batch_size * 4 + size * 2 + return ops + + +class HardSwishLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + # Operations: div + add + clip + mul + size = self.mapper.parser.operatorRepresentation['size'] + return size * 4 diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index cf1ba776bd..f0abefd4f6 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -467,23 +467,62 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) - return ret def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - data_in_1 = ctxt.lookup(node.inputs[0].name) data_in_2 = ctxt.lookup(node.inputs[1].name) data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in_1'] = data_in_1.name self.operatorRepresentation['data_in_2'] = data_in_2.name self.operatorRepresentation['data_out'] = data_out.name - self.operatorRepresentation['size'] = np.prod(data_in_1.shape) + self.operatorRepresentation['size'] = np.prod(data_out.shape) + + # Check if broadcasting is needed + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + need_broadcast = (shape1 != out_shape) or (shape2 != out_shape) + self.operatorRepresentation['need_broadcast'] = need_broadcast + + if need_broadcast: + # Calculate strides for broadcasting + ndim = len(out_shape) + + # Compute strides for input 1 + strides1 = [1] * ndim + for i in range(ndim - 1, -1, -1): + if i < len(shape1) and shape1[i] == out_shape[i]: + if i == ndim - 1: + strides1[i] = 1 + else: + strides1[i] = strides1[i + 1] * shape1[i + 1] if ( + i + 1 < len(shape1) and shape1[i + 1] == out_shape[i + 1]) else strides1[i + 1] + else: + strides1[i] = 0 # Broadcast dimension + + # Compute strides for input 2 + strides2 = [1] * ndim + for i in range(ndim - 1, -1, -1): + if i < len(shape2) and shape2[i] == out_shape[i]: + if i == ndim - 1: + strides2[i] = 1 + else: + strides2[i] = strides2[i + 1] * shape2[i + 1] if ( + i + 1 < len(shape2) and shape2[i + 1] == out_shape[i + 1]) else strides2[i + 1] + else: + strides2[i] = 0 # Broadcast dimension + + self.operatorRepresentation['ndim'] = ndim + self.operatorRepresentation['strides1'] = strides1 + self.operatorRepresentation['strides2'] = strides2 + self.operatorRepresentation['out_shape'] = out_shape return ctxt, True diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..6b3ff546b3 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ 
b/Deeploy/Targets/Generic/TypeCheckers.py @@ -102,6 +102,20 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [False] +class FloatAddChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [True] + + class GatherChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): @@ -610,3 +624,40 @@ def _inferNumLevels(self, inputs: List[VariableBuffer], def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: return [True] + + +class RMSNormChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + # RMSNorm: square, mean, sqrt, reciprocal, multiply + # Output precision similar to input + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + # RMSNorm output can be signed (depending on input signedness) + if inputs[0]._signed: + return [True] + else: + return [False] + + +class HardSwishChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + if inputs[0]._signed: + return [True] + else: + return [False] diff --git a/Deeploy/Targets/Snitch/Bindings.py b/Deeploy/Targets/Snitch/Bindings.py index 25b150b553..e442f63038 100644 --- a/Deeploy/Targets/Snitch/Bindings.py +++ b/Deeploy/Targets/Snitch/Bindings.py @@ -11,12 +11,20 @@ from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import iNoNormTemplate -from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker +from Deeploy.Targets.Generic.Templates import ConcatTemplate, iNoNormTemplate +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, DivChecker, GatherChecker, GEMMChecker, \ + HardSwishChecker, MatMulChecker, MulChecker, ReshapeChecker, RMSNormChecker, RQAddChecker, SoftmaxChecker, \ + TransposeChecker, iNoNormChecker from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \ SnitchSynchCoresPass from Deeploy.Targets.Snitch.DMA.SnitchDma import SnitchDma -from Deeploy.Targets.Snitch.Templates import AddTemplate, 
FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate +from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, FloatMatMulTemplate, GatherTemplate, \ + MatMulTemplate, ReshapeTemplate, RQAddTemplate, TransposeTemplate, iSoftmaxTemplate +from Deeploy.Targets.Snitch.Templates.FloatAddTemplate import referenceTemplate as FloatAddTemplate +from Deeploy.Targets.Snitch.Templates.FloatDivTemplate import referenceTemplate as FloatDivTemplate +from Deeploy.Targets.Snitch.Templates.FloatHardSwishTemplate import referenceTemplate as FloatHardSwishTemplate +from Deeploy.Targets.Snitch.Templates.FloatMulTemplate import referenceTemplate as FloatMulTemplate +from Deeploy.Targets.Snitch.Templates.FloatRMSNormTemplate import referenceTemplate as FloatRMSNormTemplate from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template from Deeploy.Targets.Snitch.Templates.GemmTemplate import SnitchGemm_Template from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template @@ -45,6 +53,7 @@ ArgumentStructGeneration(), MemoryManagementGeneration("L1"), MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + MemoryManagementGeneration("L2"), MemoryManagementGeneration() ]) @@ -69,7 +78,18 @@ SnitchAddBindings = [ NodeBinding(AddChecker([PointerClass(_type), PointerClass(_type)], [PointerClass(int32_t)]), AddTemplate.referenceTemplate, TiledTransformer) for _type in [int8_t] +] + [ + # fp32 support + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate, TiledTransformer) +] + +# Basic (non-tiled) FP32 Add Bindings +BasicAddBindings = [ + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate, BasicTransformer) ] + SnitchGemmBindings = [ NodeBinding( GEMMChecker([PointerClass(int8_t), PointerClass(int8_t), @@ -90,3 +110,99 @@ PointerClass(int32_t) ], [PointerClass(int8_t)]), SnitchRqGemm_Template, TiledTransformer) ] + +# RMSNorm Bindings (Tiled) +SnitchRMSNormBindings = [ + NodeBinding(RMSNormChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatRMSNormTemplate, TiledTransformer) +] + +# RMSNorm Bindings (Non-tiled) +BasicRMSNormBindings = [ + NodeBinding(RMSNormChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatRMSNormTemplate, BasicTransformer) +] + +# HardSwish Bindings (Tiled) +SnitchHardSwishBindings = [ + NodeBinding(HardSwishChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatHardSwishTemplate, + TiledTransformer) +] + +# HardSwish Bindings (Non-tiled) +BasicHardSwishBindings = [ + NodeBinding(HardSwishChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatHardSwishTemplate, + BasicTransformer) +] + +# Div Bindings (Tiled) +SnitchDivBindings = [ + NodeBinding(DivChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatDivTemplate, TiledTransformer) +] + +# Div Bindings (Non-tiled) +BasicDivBindings = [ + NodeBinding(DivChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatDivTemplate, BasicTransformer) +] + +# Mul Bindings (Tiled) +SnitchMulBindings = [ + NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMulTemplate, TiledTransformer) +] + +# Mul Bindings (Non-tiled) +BasicMulBindings = [ + NodeBinding(MulChecker([PointerClass(float32_t), 
PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMulTemplate, BasicTransformer) +] + +# MatMul Bindings (Tiled) +SnitchMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + MatMulTemplate.referenceTemplate, TiledTransformer), + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMatMulTemplate.referenceTemplate, TiledTransformer) +] + +# Concat Bindings (Tiled) +SnitchConcatBindings = [ + NodeBinding(ConcatChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int8_t)]), + ConcatTemplate.referenceTemplate, TiledTransformer), + NodeBinding(ConcatChecker([PointerClass(int32_t), PointerClass(int32_t)], [PointerClass(int32_t)]), + ConcatTemplate.referenceTemplate, TiledTransformer), + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, TiledTransformer) +] + +# Transpose Bindings (Tiled) +SnitchTransposeBindings = [ + NodeBinding(TransposeChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), TransposeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(TransposeChecker([PointerClass(int32_t)], [PointerClass(int32_t)]), TransposeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(TransposeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + TransposeTemplate.referenceTemplate, TiledTransformer) +] + +# Reshape Bindings (Tiled) +SnitchReshapeBindings = [ + NodeBinding(ReshapeChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), ReshapeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(ReshapeChecker([PointerClass(int32_t)], [PointerClass(int32_t)]), ReshapeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(ReshapeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), ReshapeTemplate.referenceTemplate, + TiledTransformer) +] + +# Gather Bindings (Tiled) +SnitchGatherBindings = [ + NodeBinding(GatherChecker([PointerClass(int8_t), PointerClass(int32_t)], [PointerClass(int8_t)]), + GatherTemplate.referenceTemplate, TiledTransformer), + NodeBinding(GatherChecker([PointerClass(int32_t), PointerClass(int32_t)], [PointerClass(int32_t)]), + GatherTemplate.referenceTemplate, TiledTransformer), + NodeBinding(GatherChecker([PointerClass(float32_t), PointerClass(int32_t)], [PointerClass(float32_t)]), + GatherTemplate.referenceTemplate, TiledTransformer) +] diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py index e8204f6ae2..a3e10ed188 100644 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py +++ b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py @@ -23,15 +23,31 @@ class SnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration): class ProfilingSnitchClusterTilingSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn): _printCycleDifference = NodeTemplate(r""" - printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ - ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + printf("%s%u][Core %d] %s%6u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurement}, ${suffixStr}); + """) + + _printCycleContribution = NodeTemplate(r""" + uint32_t total = ${measurementInput} + ${measurementKernel} + 
${measurementOutput}; + uint32_t dma = ${measurementInput} + ${measurementOutput}; + float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; + printf("%s%u][Core %d] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\n", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), total, kernel_percentage, overhead_percentage, ${measurementKernel}, dma); """) class ProfilingSnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn): _printCycleDifference = NodeTemplate(r""" - printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ - ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + printf("%s%u][Core %d] %s%6u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurement}, ${suffixStr}); + """) + + _printCycleContribution = NodeTemplate(r""" + uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput}; + uint32_t dma = ${measurementInput} + ${measurementOutput}; + float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; + printf("%s%u][Core %d] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\n", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), total, kernel_percentage, overhead_percentage, ${measurementKernel}, dma); """) diff --git a/Deeploy/Targets/Snitch/Parsers.py b/Deeploy/Targets/Snitch/Parsers.py index 0051994686..6976d8d356 100644 --- a/Deeploy/Targets/Snitch/Parsers.py +++ b/Deeploy/Targets/Snitch/Parsers.py @@ -4,10 +4,11 @@ from typing import Tuple +import numpy as np import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import NetworkContext -from Deeploy.Targets.Generic.Parsers import GEMMParser, RQGEMMParser +from Deeploy.DeeployTypes import NetworkContext, NodeParser +from Deeploy.Targets.Generic.Parsers import AddParser, DivParser, GEMMParser, MulParser, RQGEMMParser class SnitchGEMMParser(GEMMParser): @@ -72,3 +73,262 @@ def parseNodeCtxt(self, return ctxt, False return newCtxt, True + + +class SnitchRMSNormParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + if node.op != 'RMSNorm': + return False + if len(node.inputs) != 2 or len(node.outputs) != 1: + return False + eps = node.attrs.get('eps', node.attrs.get('epsilon', 1e-6)) + self.operatorRepresentation['eps'] = f"{float(eps):.10e}f" + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + data_in = ctxt.lookup(node.inputs[0].name) + weight = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['weight'] = weight.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['input_shape'] = list(data_in.shape) + self.operatorRepresentation['weight_shape'] = list(weight.shape) + self.operatorRepresentation['output_shape'] = list(data_out.shape) + self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + self.operatorRepresentation['lastDimLength'] = int(data_in.shape[-1]) + self.operatorRepresentation['input_ndim'] = len(data_in.shape) + self.operatorRepresentation['weight_ndim'] = len(weight.shape) + + 
return ctxt, True + + +class HardSwishParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + """Parse HardSwish node.""" + + if node.op != 'HardSwish': + return False + + # Check basic structure: 1 input and 1 output + if len(node.inputs) != 1 or len(node.outputs) != 1: + return False + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """Parse HardSwish node with network context.""" + + # Get input and output buffers + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + + # Store buffer names + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + + # Calculate size for memory allocation + self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + + return ctxt, True + + +class SnitchAddParser(AddParser): + """ + Inherits from GenericAddParser and adds support for Broadcasting. + + Compatibility: + - No broadcasting: Uses the Add_fp32() fast path. + - With broadcasting: Uses the Add_fp32_broadcast() generic version. + """ + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Call parent method to retrieve basic information + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return ctxt, False + + # Retrieve shape information + data_in_1 = ctxt.lookup(node.inputs[0].name) + data_in_2 = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + # Correct 'size' to match the output shape (after broadcasting) + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + + # Broadcasting information + self.operatorRepresentation['shape1'] = shape1 + self.operatorRepresentation['shape2'] = shape2 + self.operatorRepresentation['out_shape'] = out_shape + self.operatorRepresentation['ndim'] = len(out_shape) + + # Determine if broadcasting is needed + need_broadcast = (shape1 != shape2) + self.operatorRepresentation['need_broadcast'] = need_broadcast + + if need_broadcast: + strides1, strides2 = self._compute_broadcast_strides(shape1, shape2, out_shape) + self.operatorRepresentation['strides1'] = strides1 + self.operatorRepresentation['strides2'] = strides2 + + return ctxt, True + + def _compute_broadcast_strides(self, shape1, shape2, out_shape): + """ + Calculates strides after broadcasting (following ONNX/NumPy rules). + + Principles: + - Align dimensions from right to left. + - When a dimension is 1, set stride to 0 to achieve the broadcasting effect. 
+ + Example: + shape1=[8,8,8], shape2=[8] + → pad2=[1,1,8] + → strides1=[64,8,1], strides2=[0,0,1] + """ + ndim = len(out_shape) + + # Right-align and pad to the same number of dimensions + pad1 = [1] * (ndim - len(shape1)) + shape1 + pad2 = [1] * (ndim - len(shape2)) + shape2 + + def calc_strides(padded_shape, out_shape): + strides = [] + stride = 1 + for i in range(ndim - 1, -1, -1): + if padded_shape[i] == 1 and out_shape[i] > 1: + strides.insert(0, 0) # Broadcast dimension stride=0 + else: + strides.insert(0, stride) + + # Update stride multiplier only if the current dimension is essentially used (size > 1) + stride *= padded_shape[i] if padded_shape[i] > 1 else 1 + return strides + + strides1 = calc_strides(pad1, out_shape) + strides2 = calc_strides(pad2, out_shape) + + return strides1, strides2 + + +class SnitchDivParser(DivParser): + """ + Snitch-specific Div Parser. + Inherits from Generic DivParser and adds shape/broadcasting information. + """ + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """ + Extend Generic parser to add shape and broadcasting information. + """ + # Call parent method first + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + # Get shape information + data_in_1 = ctxt.lookup(node.inputs[0].name) + data_in_2 = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + # Store shape information + self.operatorRepresentation['shape1'] = shape1 + self.operatorRepresentation['shape2'] = shape2 + self.operatorRepresentation['out_shape'] = out_shape + + # Calculate sizes + self.operatorRepresentation['size1'] = int(np.prod(shape1)) + self.operatorRepresentation['size2'] = int(np.prod(shape2)) + + # Update output size (may differ due to broadcasting) + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + + # Check if scalar broadcasting (input2 is scalar) + self.operatorRepresentation['is_scalar'] = (self.operatorRepresentation['size2'] == 1) + + return ctxt, True + + +class SnitchMulParser(MulParser): + """ + Snitch-specific Mul Parser. + Inherits from Generic MulParser and adds shape/broadcasting information. + """ + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """ + Extend Generic parser to add shape and broadcasting information. 
+ """ + # Call parent method first + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + # Get shape information + data_in_1 = ctxt.lookup(node.inputs[0].name) + data_in_2 = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + # Store shape information + self.operatorRepresentation['shape1'] = shape1 + self.operatorRepresentation['shape2'] = shape2 + self.operatorRepresentation['out_shape'] = out_shape + + # Calculate sizes + self.operatorRepresentation['size1'] = int(np.prod(shape1)) + self.operatorRepresentation['size2'] = int(np.prod(shape2)) + + # Update output size (may differ due to broadcasting) + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + + # Check if scalar broadcasting (input2 is scalar) + self.operatorRepresentation['is_scalar'] = (self.operatorRepresentation['size2'] == 1) + + return ctxt, True diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py index d62d1c3802..32bf53190f 100644 --- a/Deeploy/Targets/Snitch/Platform.py +++ b/Deeploy/Targets/Snitch/Platform.py @@ -2,46 +2,69 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List +from typing import List, Type import numpy as np +from Deeploy.AbstractDataTypes import Pointer, PointerClass, VoidType from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer -from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicLayerNormBindings, BasicMatMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding -from Deeploy.Targets.Generic.Layers import AddLayer, GatherLayer, GEMMLayer, LayerNormLayer, MatMulLayer, PadLayer, \ - ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer, iNoNormLayer -from Deeploy.Targets.Generic.Parsers import AddParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \ - RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser +from Deeploy.Targets.Generic.Bindings import BasicConcatBindings, BasicGatherBindings, BasicLayerNormBindings, \ + BasicMatMulBindings, BasicPad1DBindings, BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding, \ + BasicTransposeBindings +from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, DivLayer, GatherLayer, GEMMLayer, HardSwishLayer, \ + LayerNormLayer, MatMulLayer, MulLayer, PadLayer, ReshapeLayer, RMSNormLayer, RQGEMMLayer, RQIntegerDivLayer, \ + SoftmaxLayer, TransposeLayer, iNoNormLayer +from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \ + ReshapeParser, RQAddParser, RQIntegerDivParser, SoftmaxParser, TransposeParser, UnsqueezeParser, iLayerNormParser, \ + iNoNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import AddRequantMergePass, GEMMRequantMergePass, \ IntegerDivRequantMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.PULPOpen.Platform import RQAddMapper -from Deeploy.Targets.Snitch.Parsers import 
SnitchGEMMParser, SnitchRQGEMMParser +from Deeploy.Targets.Snitch.Bindings import BasicDivBindings, BasicHardSwishBindings, BasicMulBindings, \ + BasicRMSNormBindings, SnitchAddBindings, SnitchGemmBindings, SnitchiNoNormBindings, SnitchiSoftmaxBindings, \ + SnitchRQAddBindings, SnitchRqGemmBindings +from Deeploy.Targets.Snitch.Parsers import HardSwishParser, SnitchDivParser, SnitchGEMMParser, SnitchMulParser, \ + SnitchRMSNormParser, SnitchRQGEMMParser from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate -from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchGemmTilingReadyBindings, \ - SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, SnitchRQAddTilingReadyBindings, \ - SnitchRqGemmTilingReadyBindings +# ============================================================================= +# Mappers for UNTILED mode (using BasicBindings with BasicTransformer) +# These are used by generateNetwork.py (testRunner_snitch.py) +# ============================================================================= GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings) +ReshapeMapper = NodeMapper(ReshapeParser(), BasicReshapeBindings) +TransposeMapper = NodeMapper(TransposeParser(), BasicTransposeBindings) +ConcatMapper = NodeMapper(ConcatParser(), BasicConcatBindings) RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) -MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) -GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings) -RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings) -iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) -SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) -iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings) +# These use TiledTransformer but work in both modes (original upstream behavior) +GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmBindings) +RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmBindings) +iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxBindings) +SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxBindings) +iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormBindings) iLayerNormMapper = NodeMapper(iLayerNormParser(), BasicLayerNormBindings) -RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings) -AddMapper = NodeMapper(AddParser(), SnitchAddTileReadyBindings) +RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddBindings) +AddMapper = NodeMapper(AddParser(), SnitchAddBindings) + +# New operators for microLlama - using BasicBindings for untiled mode +RMSNormMapper = NodeMapper(SnitchRMSNormParser(), BasicRMSNormBindings) +HardSwishMapper = NodeMapper(HardSwishParser(), BasicHardSwishBindings) +MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) +DivMapper = NodeMapper(SnitchDivParser(), BasicDivBindings) +MulMapper = NodeMapper(SnitchMulParser(), BasicMulBindings) +# ============================================================================= +# SnitchMapping - for UNTILED mode (generateNetwork.py) +# Uses BasicBindings for new operators, TiledTransformer bindings for original ops +# ============================================================================= SnitchMapping = 
{ 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), 'Gather': GatherLayer([GatherMapper]), @@ -56,6 +79,72 @@ 'iLayerNorm': LayerNormLayer([iLayerNormMapper]), 'RequantizedAdd': AddLayer([RQAddMapper]), 'Add': AddLayer([AddMapper]), + 'RMSNorm': RMSNormLayer([RMSNormMapper]), + 'HardSwish': HardSwishLayer([HardSwishMapper]), + 'Div': DivLayer([DivMapper]), + 'Mul': MulLayer([MulMapper]), + 'Reshape': ReshapeLayer([ReshapeMapper]), + 'Transpose': TransposeLayer([TransposeMapper]), + 'Concat': ConcatLayer([ConcatMapper]), +} + +# ============================================================================= +# Import TilingReadyBindings for TILED mode (testMVP.py) +# These will be used by TilerDeployerWrapper +# ============================================================================= +from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchConcatTilingReadyBindings, \ + SnitchDivTilingReadyBindings, SnitchGatherTilingReadyBindings, SnitchGemmTilingReadyBindings, \ + SnitchHardSwishTilingReadyBindings, SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, \ + SnitchMatMulTilingReadyBindings, SnitchMulTilingReadyBindings, SnitchReshapeTilingReadyBindings, \ + SnitchRMSNormTilingReadyBindings, SnitchRQAddTilingReadyBindings, SnitchRqGemmTilingReadyBindings, \ + SnitchTransposeTilingReadyBindings + +# ============================================================================= +# Tiled Mappers - for TILED mode (testMVP.py via TilerDeployerWrapper) +# ============================================================================= +TiledGatherMapper = NodeMapper(GatherParser(), SnitchGatherTilingReadyBindings) +TiledUnsqueezeMapper = NodeMapper(UnsqueezeParser(), SnitchReshapeTilingReadyBindings) +TiledReshapeMapper = NodeMapper(ReshapeParser(), SnitchReshapeTilingReadyBindings) +TiledTransposeMapper = NodeMapper(TransposeParser(), SnitchTransposeTilingReadyBindings) +TiledConcatMapper = NodeMapper(ConcatParser(), SnitchConcatTilingReadyBindings) +TiledMatMulMapper = NodeMapper(MatMulParser(), SnitchMatMulTilingReadyBindings) +TiledRMSNormMapper = NodeMapper(SnitchRMSNormParser(), SnitchRMSNormTilingReadyBindings) +TiledHardSwishMapper = NodeMapper(HardSwishParser(), SnitchHardSwishTilingReadyBindings) +TiledDivMapper = NodeMapper(SnitchDivParser(), SnitchDivTilingReadyBindings) +TiledMulMapper = NodeMapper(SnitchMulParser(), SnitchMulTilingReadyBindings) +TiledGemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings) +TiledRqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings) +TilediSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) +TiledSoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) +TilediNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings) +TiledRQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings) +TiledAddMapper = NodeMapper(AddParser(), SnitchAddTileReadyBindings) + +# ============================================================================= +# SnitchTiledMapping - for TILED mode (testMVP.py) +# Uses TilingReadyBindings for all operators +# ============================================================================= +SnitchTiledMapping = { + 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), + 'Gather': GatherLayer([TiledGatherMapper]), + 'Pad': PadLayer([Pad1DMapper, Pad2DMapper]), + 'Unsqueeze': ReshapeLayer([TiledUnsqueezeMapper]), + 'MatMul': MatMulLayer([TiledMatMulMapper]), + 'Gemm': 
GEMMLayer([TiledGemmMapper]), + 'RQGemm': RQGEMMLayer([TiledRqGemmMapper]), + 'iSoftmax': SoftmaxLayer([TilediSoftmaxMapper]), + 'Softmax': SoftmaxLayer([TiledSoftmaxMapper]), + 'iNoNorm': iNoNormLayer([TilediNoNormMapper]), + 'iLayerNorm': LayerNormLayer([iLayerNormMapper]), + 'RequantizedAdd': AddLayer([TiledRQAddMapper]), + 'Add': AddLayer([TiledAddMapper]), + 'RMSNorm': RMSNormLayer([TiledRMSNormMapper]), + 'HardSwish': HardSwishLayer([TiledHardSwishMapper]), + 'Div': DivLayer([TiledDivMapper]), + 'Mul': MulLayer([TiledMulMapper]), + 'Reshape': ReshapeLayer([TiledReshapeMapper]), + 'Transpose': TransposeLayer([TiledTransposeMapper]), + 'Concat': ConcatLayer([TiledConcatMapper]), } @@ -105,6 +194,12 @@ class SnitchConstantBuffer(ConstantBuffer): allocTemplate = AllocateTemplate.snitchL2GlobalAllocateTemplate deallocTemplate = FreeTemplate.snitchL2GlobalTemplate + def __init__(self, name: str = '', shape = [1], values = [0]): + super().__init__(name, shape, values) + # Initialize _type with a default value to prevent AttributeError + # The actual type will be set later via annotateType + self._type: Type[Pointer] = PointerClass(VoidType) + def _bufferRepresentation(self): operatorRepresentation = super()._bufferRepresentation() @@ -163,3 +258,21 @@ def __init__(self, transientBuffer = SnitchTransientBuffer, includeList: List[str] = _includeList): super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + +class SnitchTiledClusterEngine(DeploymentEngine): + + def __init__(self, name: str, Mapping = SnitchTiledMapping, initCode = "", includeList = _includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) + + +class SnitchTiledPlatform(DeploymentPlatform): + + def __init__(self, + engines = [SnitchTiledClusterEngine("SnitchCluster")], + variableBuffer = SnitchVariableBuffer, + constantBuffer = SnitchConstantBuffer, + structBuffer = SnitchStructBuffer, + transientBuffer = SnitchTransientBuffer, + includeList: List[str] = _includeList): + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) diff --git a/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py new file mode 100644 index 0000000000..5c5675c58f --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +# Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatAddTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Always initialize these variables to avoid Mako errors + operatorRepresentation.setdefault('need_broadcast', False) + operatorRepresentation.setdefault('ndim', 0) + operatorRepresentation.setdefault('strides1_str', '{}') + operatorRepresentation.setdefault('strides2_str', '{}') + operatorRepresentation.setdefault('out_shape_str', '{}') + + # If broadcasting is required, generate the stride array strings + if operatorRepresentation['need_broadcast']: + strides1 = operatorRepresentation['strides1'] + strides2 = operatorRepresentation['strides2'] + out_shape = operatorRepresentation['out_shape'] + operatorRepresentation['strides1_str'] = '{' + ', '.join(map(str, strides1)) + '}' + 
operatorRepresentation['strides2_str'] = '{' + ', '.join(map(str, strides2)) + '}' + operatorRepresentation['out_shape_str'] = '{' + ', '.join(map(str, out_shape)) + '}' + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _FloatAddTemplate(""" +// Snitch FP32 Add (Name: ${nodeName}, Op: ${nodeOp}) +% if need_broadcast: +{ + uint32_t strides1[${ndim}] = ${strides1_str}; + uint32_t strides2[${ndim}] = ${strides2_str}; + uint32_t out_shape[${ndim}] = ${out_shape_str}; + Add_fp32_broadcast(${data_in_1}, ${data_in_2}, ${data_out}, out_shape, strides1, strides2, ${ndim}, ${size}); +} +% else: +Add_fp32(${data_in_1}, ${data_in_2}, ${data_out}, ${size}); +% endif +""") diff --git a/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py new file mode 100644 index 0000000000..ee35255e24 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py @@ -0,0 +1,49 @@ +# ~/Deeploy/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py + +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatDivTemplate(NodeTemplate): + """Template for FP32 Div operation with dynamic template selection.""" + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # Check if scalar broadcasting + is_scalar = operatorRepresentation.get('is_scalar', False) + + # Dynamically select template based on is_scalar flag + if is_scalar: + # Use scalar broadcasting version + self.templateStr = FloatDivScalarTemplateStr + else: + # Use element-wise version + self.templateStr = FloatDivTemplateStr + + return ctxt, operatorRepresentation, [] + + +# Template for element-wise division +FloatDivTemplateStr = r""" +Div_fp32(${input1}, ${input2}, ${output}, ${size}); +""" + +# Template for scalar broadcasting (optimized) +FloatDivScalarTemplateStr = r""" +{ + float32_t scalar = ${input2}[0]; + Div_fp32_scalar(${input1}, scalar, ${output}, ${size}); +} +""" + +# Create reference template with default (element-wise) +referenceTemplate = FloatDivTemplate(FloatDivTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py new file mode 100644 index 0000000000..1615282437 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatHardSwishTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation["data_in"]) + operatorRepresentation["size"] = int(np.prod(data_in.shape)) + + return ctxt, operatorRepresentation, [] + + +FloatHardSwishTemplateStr = r""" +HardSwish_fp32(${data_in}, ${data_out}, ${size}); +""" + +referenceTemplate = FloatHardSwishTemplate(FloatHardSwishTemplateStr) diff --git 
a/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py new file mode 100644 index 0000000000..0cd0a649e1 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = NodeTemplate(""" +// Matmul (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { + ${A_type.typeName} ref_${data_out}_${A} = ${A}; + ${B_type.typeName} ref_${data_out}_${B} = ${B}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for(uint32_t i=0; i<${batch}; i++){ + MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; + } +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py new file mode 100644 index 0000000000..7a970e6411 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatMulTemplate(NodeTemplate): + """Template for FP32 Mul operation with dynamic template selection.""" + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # Check if scalar broadcasting + is_scalar = operatorRepresentation.get('is_scalar', False) + + # Dynamically select template based on is_scalar flag + if is_scalar: + # Use scalar broadcasting version + self.templateStr = FloatMulScalarTemplateStr + else: + # Use element-wise version + self.templateStr = FloatMulTemplateStr + + return ctxt, operatorRepresentation, [] + + +# Template for element-wise multiplication +# Note: MulParser uses A, B, C for input1, input2, output respectively +FloatMulTemplateStr = r""" +Mul_fp32(${A}, ${B}, ${C}, ${size}); +""" + +# Template for scalar broadcasting (optimized) +FloatMulScalarTemplateStr = r""" +{ + float32_t scalar = ${B}[0]; + Mul_fp32_scalar(${A}, scalar, ${C}, ${size}); +} +""" + +# Create reference template with default (element-wise) +referenceTemplate = FloatMulTemplate(FloatMulTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py new file mode 100644 index 0000000000..8ae4d95e01 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatRMSNormTemplate(NodeTemplate): + + def __init__(self, templateStr): + 
super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation["data_in"]) + operatorRepresentation["lastDimLength"] = data_in.shape[-1] + operatorRepresentation["size"] = int(np.prod(data_in.shape)) + + return ctxt, operatorRepresentation, [] + + +FloatRMSNormTemplateStr = r""" +RMSNorm_fp32(${data_in}, ${weight}, ${data_out}, ${size}, ${lastDimLength}, ${eps}); +""" + +referenceTemplate = FloatRMSNormTemplate(FloatRMSNormTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py index 216ff35b9a..a8f32b32e3 100644 --- a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py @@ -25,8 +25,8 @@ def alignToContext(self, ctxt: NetworkContext, FloatSoftmaxTemplateStr = r""" - uint32_t batch_size = ${size} / ${lastDimLength}; - uint32_t compute_num = 1; //snrt_cluster_compute_core_num(); + int32_t batch_size = ${size} / ${lastDimLength}; + int32_t compute_num = 1; //snrt_cluster_compute_core_num(); int32_t ldI = compute_num * ${input_samples}; int32_t batch_offset = ${seq_len} * ${input_samples}; diff --git a/Deeploy/Targets/Snitch/Templates/GatherTemplate.py b/Deeploy/Targets/Snitch/Templates/GatherTemplate.py new file mode 100644 index 0000000000..fa4f6a2a86 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/GatherTemplate.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = NodeTemplate(""" +// Gather (Name: ${nodeName}, Op: ${nodeOp}) +<% +width = int(data_in_type.referencedType.typeWidth/8) +%> +if (snrt_cluster_core_idx() == 0) { +for (uint32_t i=0; i<${batch}; ++i) { + memcpy(${data_out} + i * ${axis_length}, ${data_in} + i * ${batch_length} + ${index} * ${axis_length}, ${axis_length} * ${width}); +} +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/MatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/MatMulTemplate.py new file mode 100644 index 0000000000..bce916ea60 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/MatMulTemplate.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _MatMulTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + C = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['A_offset'] = 0 + operatorRepresentation['B_offset'] = 0 + operatorRepresentation['C_offset'] = 0 + if hasattr(A, "_signed") and hasattr(A, "nLevels"): + operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2) + if hasattr(B, "_signed") and hasattr(B, "nLevels"): + operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2) + if hasattr(C, "_signed") and hasattr(C, "nLevels"): + 
operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = _MatMulTemplate(""" +// MatMul (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { + ${A_type.typeName} ref_${data_out}_${A} = ${A}; + ${B_type.typeName} ref_${data_out}_${B} = ${B}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for(uint32_t i=0;i<${batch};i++){ + MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ${A_offset}, ${B_offset}, ${C_offset} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; + } +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py b/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py new file mode 100644 index 0000000000..a99573b27b --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer + + +class _SnitchReshapeTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # SCHEREMO: Selectively mark 'indices' dead, since we don't need them + if 'indices' in operatorRepresentation.keys(): + ctxt.globalObjects[operatorRepresentation['indices']]._deploy = False + ctxt.globalObjects[operatorRepresentation['indices']]._live = False + + # Same for "shape" + if "shape" in operatorRepresentation.keys(): + ctxt.globalObjects[operatorRepresentation["shape"]]._deploy = False + ctxt.globalObjects[operatorRepresentation["shape"]]._live = False + + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # Link aliases to each buffer + bufferIn.aliases.add(bufferOut.name) + bufferOut.aliases.add(bufferIn.name) + + return ctxt, operatorRepresentation, [] + + +# Use snrt_cluster_core_idx() == 0 instead of SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = _SnitchReshapeTemplate(""" +// Reshape (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { ${data_out} = ${data_in}; } +""") diff --git a/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py b/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py new file mode 100644 index 0000000000..5e33f85aa0 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = NodeTemplate(""" +// Transpose ${data_in_shape} -> ${data_out_shape} (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { 
+${data_out_type.typeName} dummy_${data_out} = ${data_out}; +<% + dimStr = '' + accessStr = '' + shapeStr = '' + for dim in data_in_shape: + dimStr += '['+str(dim)+']' +%> +% for idx, i in enumerate(perm[:-1]): +<% + shapeStr += '['+str(data_in_shape[idx+1])+']' +%> +% endfor +% for idx, i in enumerate(perm): +<% + shape = data_out_shape[idx] + accessStr += '[i_'+str(idx)+']' +%> +for(uint32_t i_${i} = 0; i_${i}<${shape}; i_${i}++){ +% endfor +*dummy_${data_out}++ = ((${data_in_type.referencedType.typeName} (*)${shapeStr})${data_in})${accessStr}; +% for idx, i in enumerate(perm): +} +% endfor +} +""") diff --git a/Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py new file mode 100644 index 0000000000..b9b07be30a --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py @@ -0,0 +1,112 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class FloatDivTileConstraint(TileConstraint): + """Tile constraint for FP32 Div operation supporting scalar broadcasting.""" + + dataIn1Name = "input1" + dataIn2Name = "input2" + dataOutName = "output" + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict[cls.dataIn1Name] + inputBuffer2Name = parseDict[cls.dataIn2Name] + outputBufferName = parseDict[cls.dataOutName] + + input1Shape = ctxt.lookup(inputBuffer1Name).shape + input2Shape = ctxt.lookup(inputBuffer2Name).shape + + # Add tensor dimensions to model + tilerModel.addTensorDimToModel(ctxt, inputBuffer1Name) + tilerModel.addTensorDimToModel(ctxt, outputBufferName) + + # Check if input2 is scalar (total size == 1) + is_scalar = np.prod(input2Shape) == 1 + + if is_scalar: + # Scalar broadcasting: input2 is a scalar, don't tile it + # Only add input2 dimensions if it has more than 0 dims + if len(input2Shape) > 0: + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + # Constrain scalar to remain untiled (size 1) + for dim in range(len(input2Shape)): + input2DimVar = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + tilerModel.addConstraint(input2DimVar == input2Shape[dim]) + + # Input1 and output must have same dimensions + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + else: + # Element-wise: both inputs must have same shape + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + 
outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + + tilerModel.addConstraint(inputDim1Var == inputDim2Var) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + # Check if scalar broadcasting + input2Name = operatorRepresentation[cls.dataIn2Name] + input2Shape = ctxt.lookup(input2Name).shape + is_scalar = np.prod(input2Shape) == 1 + + for cube in outputCubes: + newSize = np.prod(cube.dims) + replacements["size"].append(newSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + if is_scalar: + # For scalar, load the entire scalar tensor (size 1) + scalarCube = HyperRectangle(tuple([0] * len(input2Shape)), tuple(input2Shape)) + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: scalarCube}) + else: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) + + for out in outputCubes: + outputLoadSchedule.append({cls.dataOutName: out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py new file mode 100644 index 0000000000..99df639004 --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py @@ -0,0 +1,112 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class FloatMulTileConstraint(TileConstraint): + """Tile constraint for FP32 Mul operation supporting scalar broadcasting.""" + + dataIn1Name = "A" + dataIn2Name = "B" + dataOutName = "C" + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict[cls.dataIn1Name] + inputBuffer2Name = parseDict[cls.dataIn2Name] + outputBufferName = parseDict[cls.dataOutName] + + input1Shape = ctxt.lookup(inputBuffer1Name).shape + input2Shape = ctxt.lookup(inputBuffer2Name).shape + + # Add tensor dimensions to model + tilerModel.addTensorDimToModel(ctxt, inputBuffer1Name) + 
tilerModel.addTensorDimToModel(ctxt, outputBufferName) + + # Check if input2 is scalar (total size == 1) + is_scalar = np.prod(input2Shape) == 1 + + if is_scalar: + # Scalar broadcasting: input2 is a scalar, don't tile it + # Only add input2 dimensions if it has more than 0 dims + if len(input2Shape) > 0: + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + # Constrain scalar to remain untiled (size 1) + for dim in range(len(input2Shape)): + input2DimVar = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + tilerModel.addConstraint(input2DimVar == input2Shape[dim]) + + # Input1 and output must have same dimensions + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + else: + # Element-wise: both inputs must have same shape + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + + tilerModel.addConstraint(inputDim1Var == inputDim2Var) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + # Check if scalar broadcasting + input2Name = operatorRepresentation[cls.dataIn2Name] + input2Shape = ctxt.lookup(input2Name).shape + is_scalar = np.prod(input2Shape) == 1 + + for cube in outputCubes: + newSize = np.prod(cube.dims) + replacements["size"].append(newSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + if is_scalar: + # For scalar, load the entire scalar tensor (size 1) + scalarCube = HyperRectangle(tuple([0] * len(input2Shape)), tuple(input2Shape)) + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: scalarCube}) + else: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) + + for out in outputCubes: + outputLoadSchedule.append({cls.dataOutName: out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py new file mode 100644 index 0000000000..1bafa36e3b --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py @@ -0,0 +1,143 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing 
import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class ReshapeTileConstraint(TileConstraint): + """Tile constraint for Reshape operation - a NOP that just reinterprets data layout.""" + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + pointer: List[str] = [] + + for key, value in parseDict.items(): + if not isinstance(value, str): + continue + + if ctxt.is_global(value) or ctxt.is_local(value): + pointer.append(value) + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + _buffer = ctxt.lookup(bufferName) + tilerModel.addTensorDimToModel(ctxt, bufferName) + + for idx, shapeDim in enumerate(_buffer.shape): + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = bufferName, dimIdx = idx) <= shapeDim) + + # Constrain total elements to be equal + inputBuffer = ctxt.lookup(inputBufferName) + outputBuffer = ctxt.lookup(outputBufferName) + + # For reshape, we want the tiles to have the same total number of elements + # This is automatically satisfied if we tile based on output and compute input from that + + # Remove unused tensors from deployment + for bufferName in pointer: + if bufferName not in [inputBufferName, outputBufferName]: + ctxt.lookup(bufferName)._deploy = False + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # For reshape, input and output have the same data, just different interpretations + # We need to compute the corresponding input cube for each output cube + inputName = operatorRepresentation['data_in'] + outputName = operatorRepresentation['data_out'] + inputShape = ctxt.lookup(inputName).shape + outputShape = ctxt.lookup(outputName).shape + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + # Calculate the flat offset and size for the output cube + outSize = np.prod(cube.dims) + replacements["size"].append(outSize) + + # For reshape, we need to map output cube to input cube + # Calculate flat index range for output cube + outOffset = 0 + outStrides = [] + stride = 1 + for dim in reversed(outputShape): + outStrides.insert(0, stride) + stride *= dim + + for i, (off, dim) in enumerate(zip(cube.offset, cube.dims)): + outOffset += off * outStrides[i] + + # Convert flat offset to input coordinates + inStrides = [] + stride = 1 + 
for dim in reversed(inputShape): + inStrides.insert(0, stride) + stride *= dim + + inOffset = [] + remaining = outOffset + for i, stride in enumerate(inStrides): + inOffset.append(remaining // stride) + remaining = remaining % stride + + # Calculate input cube dimensions + # For simplicity, treat as 1D cube in input space + inCubeDims = list(inputShape) + inCubeOffset = [0] * len(inputShape) + + # Set the last dimension to the size, and offset based on flat index + totalSize = outSize + if len(inputShape) > 0: + # Compute proper input cube that covers the same elements + # Use a simple approach: linearize the input + inCubeOffset = list(inOffset) + inCubeDims = [1] * len(inputShape) + inCubeDims[-1] = min(totalSize, inputShape[-1] - inCubeOffset[-1]) + remaining = totalSize - inCubeDims[-1] + + for i in range(len(inputShape) - 2, -1, -1): + if remaining <= 0: + break + inCubeDims[i] = min(remaining // np.prod(inputShape[i + 1:]) + 1, inputShape[i]) + remaining -= (inCubeDims[i] - 1) * np.prod(inputShape[i + 1:]) + + inputCube = HyperRectangle(tuple(inCubeOffset), tuple(inCubeDims)) + inputLoadSchedule.append({"data_in": inputCube}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/__init__.py b/Deeploy/Targets/Snitch/TileConstraints/__init__.py index 947a6fd82a..aece19d881 100644 --- a/Deeploy/Targets/Snitch/TileConstraints/__init__.py +++ b/Deeploy/Targets/Snitch/TileConstraints/__init__.py @@ -3,5 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from . 
import * +from .FloatDivTileConstraint import * +from .FloatMulTileConstraint import * from .iNoNormTileConstraint import * from .iSoftmaxTileConstraint import * +from .ReshapeTileConstraint import * diff --git a/Deeploy/Targets/Snitch/Tiler.py b/Deeploy/Targets/Snitch/Tiler.py index 475a425779..5a5f4d0bf4 100644 --- a/Deeploy/Targets/Snitch/Tiler.py +++ b/Deeploy/Targets/Snitch/Tiler.py @@ -3,10 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint -from Deeploy.Targets.Snitch.Bindings import SnitchAddBindings, SnitchGemmBindings, SnitchiNoNormBindings, \ - SnitchiSoftmaxBindings, SnitchRQAddBindings, SnitchRqGemmBindings +from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iRMSNormTileConstraint import iRMSNormTileConstraint +from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint +from Deeploy.Targets.Snitch.Bindings import SnitchAddBindings, SnitchConcatBindings, SnitchDivBindings, \ + SnitchGatherBindings, SnitchGemmBindings, SnitchHardSwishBindings, SnitchiNoNormBindings, SnitchiSoftmaxBindings, \ + SnitchMatMulBindings, SnitchMulBindings, SnitchReshapeBindings, SnitchRMSNormBindings, SnitchRQAddBindings, \ + SnitchRqGemmBindings, SnitchTransposeBindings from Deeploy.Targets.Snitch.TileConstraints import iNoNormTileConstraint, iSoftmaxTileConstraint +from Deeploy.Targets.Snitch.TileConstraints.FloatDivTileConstraint import FloatDivTileConstraint +from Deeploy.Targets.Snitch.TileConstraints.FloatMulTileConstraint import FloatMulTileConstraint from Deeploy.Targets.Snitch.TileConstraints.GemmTileConstraint import GemmTileConstraint +from Deeploy.Targets.Snitch.TileConstraints.ReshapeTileConstraint import ReshapeTileConstraint from Deeploy.Targets.Snitch.TileConstraints.RqGemmTileConstraint import RqGemmTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings @@ -23,3 +34,30 @@ SnitchAddTileReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchAddBindings, tileConstraint = AddTileConstraint()) + +SnitchRMSNormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchRMSNormBindings, + tileConstraint = iRMSNormTileConstraint()) + +SnitchHardSwishTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchHardSwishBindings, + tileConstraint = iHardswishTileConstraint()) + +SnitchDivTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchDivBindings, + tileConstraint = FloatDivTileConstraint()) + +SnitchMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchMulBindings, + tileConstraint = FloatMulTileConstraint()) + +SnitchMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchMatMulBindings, + tileConstraint = MatMulTileConstraint()) + +SnitchConcatTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchConcatBindings, + tileConstraint = ConcatTileConstraint()) + +SnitchTransposeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchTransposeBindings, + tileConstraint = TransposeTileConstraint()) + +SnitchReshapeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings 
= SnitchReshapeBindings, + tileConstraint = ReshapeTileConstraint()) + +SnitchGatherTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchGatherBindings, + tileConstraint = GatherTileConstraint()) diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz new file mode 100644 index 0000000000..eec4cee600 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx b/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx new file mode 100644 index 0000000000..7a146e5541 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx @@ -0,0 +1,14 @@ + +hardswish_test_fp32: +* +inputoutputHardSwish_node" HardSwishhardswish_graph_fp32Z +input + + + +€b +output + + + +€B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz new file mode 100644 index 0000000000..074c937f5b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/inputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/inputs.npz new file mode 100644 index 0000000000..9d14ca82f7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/network.onnx b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/network.onnx new file mode 100644 index 0000000000..25a7a9b683 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/outputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/outputs.npz new file mode 100644 index 0000000000..6167f74042 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/outputs.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/activations.npz new file mode 100644 index 0000000000..d077979636 Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/activations.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/inputs.npz new file mode 100644 index 0000000000..89c505c669 Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/inputs.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/network.onnx new file mode 100644 index 0000000000..30b6d8420f Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/network.onnx differ diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/outputs.npz new file mode 100644 index 0000000000..07fda6854d Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/outputs.npz differ diff --git a/DeeployTest/testRunner_tiled_snitch.py b/DeeployTest/testRunner_tiled_snitch.py index 7787d1f844..cf6ac6b2e0 100644 --- a/DeeployTest/testRunner_tiled_snitch.py +++ b/DeeployTest/testRunner_tiled_snitch.py @@ -25,7 +25,10 @@ args = parser.parse_args() - testRunner = TestRunner(platform = "Snitch", simulator = args.simulator, tiling = True, argument_parser = parser) + 
testRunner = TestRunner(platform = "Snitch_tiled", + simulator = args.simulator, + tiling = True, + argument_parser = parser) testRunner.cmake_args += f" -D NUM_CORES={args.cores}" diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 48c5777905..9d562cf577 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -24,12 +24,12 @@ from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform from Deeploy.Targets.Snitch.Deployer import SnitchDeployer -from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform +from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform, SnitchTiledPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Snitch_tiled", "Chimera"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -65,6 +65,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Snitch": Platform = SnitchPlatform() + elif platformName == "Snitch_tiled": + Platform = SnitchTiledPlatform() + elif platformName == "SoftHier": Platform = SoftHierPlatform() @@ -217,7 +220,7 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - elif isinstance(platform, (SnitchPlatform)): + elif isinstance(platform, (SnitchPlatform, SnitchTiledPlatform)): if loweringOptimizer is None: loweringOptimizer = SnitchOptimizer diff --git a/DeeployTest/testUtils/typeMapping.py b/DeeployTest/testUtils/typeMapping.py index 232fd1e274..b6851dec7e 100644 --- a/DeeployTest/testUtils/typeMapping.py +++ b/DeeployTest/testUtils/typeMapping.py @@ -48,7 +48,11 @@ def inferMinimalType(values: np.ndarray, default: Type[BaseType] = int8_t) -> Ty print(f"Warning: Empty input array for type inference for {values}!") return default - if isInteger(values): + # First check the numpy dtype - if it's a float type, use float even if values are integer-like + # This handles cases like [0.0, 0.0] which would otherwise be incorrectly typed as uint8_t + if np.issubdtype(values.dtype, np.floating): + return minimalFloatType(values) + elif isInteger(values): return minimalIntegerType(values) else: return minimalFloatType(values) diff --git a/TargetLibraries/Generic/inc/macros.h b/TargetLibraries/Generic/inc/macros.h index d97cfecb7c..0b5a0e51fb 100644 --- a/TargetLibraries/Generic/inc/macros.h +++ b/TargetLibraries/Generic/inc/macros.h @@ -7,22 +7,28 @@ #ifndef __DEEPLOY_BASIC_MATH_MACROS_HEADER_ #define __DEEPLOY_BASIC_MATH_MACROS_HEADER_ +#ifndef MAX #define MAX(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a > _b ? _a : _b; \ }) +#endif +#ifndef MIN #define MIN(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a < _b ? _a : _b; \ }) +#endif +#ifndef CLAMP #define CLAMP(x, low, high) \ (((x) > (high)) ? (high) : (((x) < (low)) ? 
(low) : (x))) +#endif #define inf 1.0f / 0.0f diff --git a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h index e44d3c20c6..1305ba6bff 100644 --- a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h +++ b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h @@ -23,8 +23,13 @@ #include "snrt.h" +#include "kernel/Add.h" +#include "kernel/Div.h" #include "kernel/Gemm.h" +#include "kernel/HardSwish.h" #include "kernel/MatMul.h" +#include "kernel/Mul.h" +#include "kernel/RMSNrom.h" #include "kernel/RQGemm.h" #include "kernel/RQMatMul.h" #include "kernel/Softmax.h" diff --git a/TargetLibraries/Snitch/inc/kernel/Add.h b/TargetLibraries/Snitch/inc/kernel/Add.h new file mode 100644 index 0000000000..7a65e82712 --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/Add.h @@ -0,0 +1,21 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_ADD_KERNEL_HEADER_ +#define __DEEPLOY_MATH_ADD_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +void Add_fp32(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, uint32_t size); + +void Add_fp32_broadcast(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t *out_shape, uint32_t *strides1, + uint32_t *strides2, uint32_t ndim, uint32_t size); + +void Add_fp32_lastdim(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t outer_size, uint32_t inner_size); + +#endif // __DEEPLOY_MATH_ADD_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Div.h b/TargetLibraries/Snitch/inc/kernel/Div.h new file mode 100644 index 0000000000..e9b257a634 --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/Div.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_ +#define __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Division (FP32) + * + * Computes: output[i] = input1[i] / input2[i] + * + * input1: Numerator tensor (float32) + * input2: Denominator tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size); + +/* + * Element-wise Division with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] / scalar + * + * input1: Numerator tensor (float32) + * scalar: Scalar denominator (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size); + +#endif // __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/HardSwish.h b/TargetLibraries/Snitch/inc/kernel/HardSwish.h new file mode 100644 index 0000000000..a0cfdaac12 --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/HardSwish.h @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_ +#define __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * HardSwish Activation Function + * + * Computes: HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + * + * Piecewise form: + * - When x <= -3: output = 0 + * - When -3 < x < 
3: output = x * (x/6 + 0.5) + * - When x >= 3: output = x + * + * This is a computationally efficient approximation of Swish/SiLU activation + * commonly used in mobile neural networks and transformer models. + * + * data_in: Input tensor (FP32) + * data_out: Output tensor (FP32, same shape as input) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void HardSwish_fp32(float32_t *data_in, float32_t *data_out, uint32_t size); + +#endif // __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Mul.h b/TargetLibraries/Snitch/inc/kernel/Mul.h new file mode 100644 index 0000000000..d851e2e3bf --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/Mul.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_ +#define __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Multiplication (FP32) + * + * Computes: output[i] = input1[i] * input2[i] + * + * input1: First input tensor (float32) + * input2: Second input tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size); + +/* + * Element-wise Multiplication with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] * scalar + * + * input1: Input tensor (float32) + * scalar: Scalar multiplier (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size); + +#endif // __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/RMSNrom.h b/TargetLibraries/Snitch/inc/kernel/RMSNrom.h new file mode 100644 index 0000000000..16e25cd38c --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/RMSNrom.h @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * RMS Normalization (Root Mean Square Normalization) + * + * Computes: output[i] = (input[i] / rms) * weight[i] + * where rms = sqrt(mean(input^2) + eps) + * + * data_in: Input tensor [batch, seq, hidden] or flattened [size] + * weight: Weight tensor [hidden_dim] + * data_out: Output tensor (same shape as input) + * size: Total number of elements (batch * seq * hidden) + * lastDimLength: Hidden dimension size + * eps: Epsilon for numerical stability (typically 1e-6) + * + * multi-core = yes + * parallelization = vector-wise (across batch * sequence) + */ +void RMSNorm_fp32(float32_t *data_in, float32_t *weight, float32_t *data_out, + uint32_t size, uint32_t lastDimLength, float32_t eps); + +#endif // __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Softmax.h b/TargetLibraries/Snitch/inc/kernel/Softmax.h index c2d7596e7a..3795bb4f3b 100644 --- a/TargetLibraries/Snitch/inc/kernel/Softmax.h +++ b/TargetLibraries/Snitch/inc/kernel/Softmax.h @@ -9,7 +9,7 @@ #include "DeeploySnitchMath.h" -void softmax_fp32(float *input, float *output, int32_t ldI, +void Softmax_fp32(float *input, 
float *output, int32_t ldI, int32_t batch_offset, int32_t batch_size, int32_t seq_len, int32_t input_samples); diff --git a/TargetLibraries/Snitch/inc/macros.h b/TargetLibraries/Snitch/inc/macros.h index bc1191d25a..04bef2394b 100644 --- a/TargetLibraries/Snitch/inc/macros.h +++ b/TargetLibraries/Snitch/inc/macros.h @@ -8,10 +8,19 @@ #define __DEEPLOY_MATH_MACROS_HEADER_ #define INT_LOG2(x) __builtin_ctz(x) + +#ifndef CLAMP #define CLAMP(x, low, high) \ (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x))) +#endif + +#ifndef MIN #define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif + +#ifndef MAX #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif // JUNGVI: The following macros are here to ensure compatibility with some // PULP-NN kernels diff --git a/TargetLibraries/Snitch/src/Add_fp32.c b/TargetLibraries/Snitch/src/Add_fp32.c new file mode 100644 index 0000000000..235b258511 --- /dev/null +++ b/TargetLibraries/Snitch/src/Add_fp32.c @@ -0,0 +1,102 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +void Add_fp32(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + pOut[i] = pIn1[i] + pIn2[i]; + } +} + +void Add_fp32_broadcast(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t *out_shape, uint32_t *strides1, + uint32_t *strides2, uint32_t ndim, uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + uint32_t idx1 = 0; + uint32_t idx2 = 0; + uint32_t tmp = i; + + for (int32_t d = ndim - 1; d >= 0; d--) { + uint32_t coord = tmp % out_shape[d]; + tmp /= out_shape[d]; + idx1 += coord * strides1[d]; + idx2 += coord * strides2[d]; + } + + pOut[i] = pIn1[idx1] + pIn2[idx2]; + } +} + +void Add_fp32_lastdim(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t outer_size, uint32_t inner_size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + uint32_t size = outer_size * inner_size; + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + uint32_t inner_idx = i % inner_size; + pOut[i] = pIn1[i] + 
pIn2[inner_idx]; + } +} diff --git a/TargetLibraries/Snitch/src/CycleCounter.c b/TargetLibraries/Snitch/src/CycleCounter.c index 3861c421c1..8a99c312e6 100644 --- a/TargetLibraries/Snitch/src/CycleCounter.c +++ b/TargetLibraries/Snitch/src/CycleCounter.c @@ -6,10 +6,15 @@ #include "DeeploySnitchMath.h" +// Define ENABLE_INSTR_COUNTER to enable instruction counting (causes warnings +// in gvsoc) #define ENABLE_INSTR_COUNTER + static uint32_t timer_init[NUM_CORES] __attribute__((section(".l1"))); static uint32_t timer_end[NUM_CORES] __attribute__((section(".l1"))); +#ifdef ENABLE_INSTR_COUNTER static uint32_t instr_init[NUM_CORES] __attribute__((section(".l1"))); static uint32_t instr_end[NUM_CORES] __attribute__((section(".l1"))); +#endif static uint32_t running[NUM_CORES] __attribute__((section(".l1"))); @@ -17,11 +22,13 @@ void ResetTimer() { snrt_reset_perf_counter(SNRT_PERF_CNT0); uint32_t const core_id = snrt_global_core_idx(); uint32_t _timer_init = read_csr(mcycle); - uint32_t _instr_init = read_csr(minstret); timer_init[core_id] = _timer_init; - instr_init[core_id] = _instr_init; timer_end[core_id] = _timer_init; +#ifdef ENABLE_INSTR_COUNTER + uint32_t _instr_init = read_csr(minstret); + instr_init[core_id] = _instr_init; instr_end[core_id] = _instr_init; +#endif running[core_id] = 0; } @@ -31,7 +38,9 @@ void StartTimer() { } uint32_t const core_id = snrt_global_core_idx(); timer_init[core_id] = read_csr(mcycle); +#ifdef ENABLE_INSTR_COUNTER instr_init[core_id] = read_csr(minstret); +#endif running[core_id] = 1; } @@ -41,7 +50,9 @@ void StopTimer() { } uint32_t const core_id = snrt_global_core_idx(); timer_end[core_id] = read_csr(mcycle); +#ifdef ENABLE_INSTR_COUNTER instr_end[core_id] = read_csr(minstret); +#endif running[core_id] = 0; } @@ -55,6 +66,7 @@ uint32_t getCycles() { } uint32_t getInstr(void) { +#ifdef ENABLE_INSTR_COUNTER uint32_t const core_id = snrt_global_core_idx(); if (running[core_id]) { @@ -62,4 +74,7 @@ uint32_t getInstr(void) { } else { return instr_end[core_id] - instr_init[core_id]; } +#else + return 0; // Instruction counting disabled +#endif } \ No newline at end of file diff --git a/TargetLibraries/Snitch/src/Div_fp32.c b/TargetLibraries/Snitch/src/Div_fp32.c new file mode 100644 index 0000000000..07c3d3c5d4 --- /dev/null +++ b/TargetLibraries/Snitch/src/Div_fp32.c @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Division (FP32) + * + * Computes: output[i] = input1[i] / input2[i] + * + * Supports ONNX broadcasting rules: + * - If input2 is scalar (size=1): divides all elements of input1 by input2[0] + * - If both have same size: element-wise division + * + * input1: Numerator tensor (float32) + * input2: Denominator tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise across input1 + */ +void Div_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize across elements + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + 
start_elem = core_id * elements_per_core + remainder; + } + + // Check if input2 is a scalar (size=1, broadcasted) + // Note: This assumes the parser has set input2_size correctly + // For now, we assume element-wise division (same size) + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] / input2[i]; + } +} + +/* + * Element-wise Division with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] / scalar + * + * input1: Numerator tensor (float32) + * scalar: Scalar denominator (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + float32_t inv_scalar = 1.0f / scalar; // Compute inverse once + + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] * inv_scalar; + } +} diff --git a/TargetLibraries/Snitch/src/Gemm_fp32.c b/TargetLibraries/Snitch/src/Gemm_fp32.c index 9a79538e12..8dac98ef67 100644 --- a/TargetLibraries/Snitch/src/Gemm_fp32.c +++ b/TargetLibraries/Snitch/src/Gemm_fp32.c @@ -11,231 +11,50 @@ void gemm_fp32_transB_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A, uint32_t ldA, float32_t *B, uint32_t ldB, float32_t *C, uint32_t ldC, float32_t *Y, uint32_t BETA, uint32_t setup_SSR) { + (void)setup_SSR; uint32_t compute_id = snrt_global_compute_core_idx(); uint32_t A_offset = K * compute_id; uint32_t C_offset = N * compute_id; - // Unrolling factor of most inner loop. 
- // Should be at least as high as the FMA delay - // for maximum utilization - const uint32_t unroll = 8; - - // SSR strides and bounds only have to be configured - // once in the beginning - if (setup_SSR) { - // First matrix is not stored in transposed format - const uint32_t ssr0_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr0_i[4] = {0, sizeof(float32_t), 0, - sizeof(float32_t) * ldA}; - - // Second matrix is stored in transposed format - const uint32_t ssr1_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr1_i[4] = {sizeof(float32_t) * K, sizeof(float32_t), - sizeof(float32_t) * K * unroll, 0}; - - snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], - ssr0_i[2], ssr0_i[3]); - - snrt_ssr_repeat(SNRT_SSR_DM0, unroll); - snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], - ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); - } - - // SSR start address need to be configured each time - - snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, &A[A_offset]); - snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, B); - snrt_ssr_enable(); - - // check dimensions and values of a and b - - // Kernel progresses by 1 values each step - // const uint32_t n_frep = K - 1; for (uint32_t m = 0; m < M; m++) { - uint32_t n = 0; - for (uint32_t n0 = 0; n0 < N / unroll; n0++) { - float c[unroll]; - - // Load intermediate result - if (BETA) { - c[0] = C[C_offset + m * ldC + n + 0]; - c[1] = C[C_offset + m * ldC + n + 1]; - c[2] = C[C_offset + m * ldC + n + 2]; - c[3] = C[C_offset + m * ldC + n + 3]; - c[4] = C[C_offset + m * ldC + n + 4]; - c[5] = C[C_offset + m * ldC + n + 5]; - c[6] = C[C_offset + m * ldC + n + 6]; - c[7] = C[C_offset + m * ldC + n + 7]; - } else { - c[0] = 0.0; - c[1] = 0.0; - c[2] = 0.0; - c[3] = 0.0; - c[4] = 0.0; - c[5] = 0.0; - c[6] = 0.0; - c[7] = 0.0; - } - - asm volatile( - "frep.o %[n_frep], 8, 0, 0 \n" - "fmadd.s %[c0], ft0, ft1, %[c0] \n" - "fmadd.s %[c1], ft0, ft1, %[c1] \n" - "fmadd.s %[c2], ft0, ft1, %[c2] \n" - "fmadd.s %[c3], ft0, ft1, %[c3] \n" - "fmadd.s %[c4], ft0, ft1, %[c4] \n" - "fmadd.s %[c5], ft0, ft1, %[c5] \n" - "fmadd.s %[c6], ft0, ft1, %[c6] \n" - "fmadd.s %[c7], ft0, ft1, %[c7] \n" - : [c0] "+f"(c[0]), [c1] "+f"(c[1]), [c2] "+f"(c[2]), [c3] "+f"(c[3]), - [c4] "+f"(c[4]), [c5] "+f"(c[5]), [c6] "+f"(c[6]), [c7] "+f"(c[7]) - : [n_frep] "r"(K - 1) - : "ft0", "ft1", "ft2"); - - // Store results back - Y[C_offset + m * ldC + n + 0] = c[0]; - Y[C_offset + m * ldC + n + 1] = c[1]; - Y[C_offset + m * ldC + n + 2] = c[2]; - Y[C_offset + m * ldC + n + 3] = c[3]; - Y[C_offset + m * ldC + n + 4] = c[4]; - Y[C_offset + m * ldC + n + 5] = c[5]; - Y[C_offset + m * ldC + n + 6] = c[6]; - Y[C_offset + m * ldC + n + 7] = c[7]; - n += unroll; - } - - // Clean up of leftover columns - snrt_ssr_disable(); - for (; n < N; n++) { + for (uint32_t n = 0; n < N; n++) { float32_t c; if (BETA) { c = C[C_offset + m * ldC + n]; } else { - c = 0.0; + c = 0.0f; } for (uint32_t k = 0; k < K; k++) { - c += A[A_offset + k + m * ldA] * B[k + n * ldB]; + c += A[A_offset + m * ldA + k] * B[n * ldB + k]; } Y[C_offset + m * ldC + n] = c; } - snrt_ssr_enable(); } - snrt_ssr_disable(); } void gemm_fp32_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A, uint32_t ldA, float32_t *B, uint32_t ldB, float32_t *C, uint32_t ldC, float32_t *Y, uint32_t BETA, uint32_t setup_SSR) { + (void)setup_SSR; + uint32_t compute_id = snrt_global_compute_core_idx(); uint32_t A_offset = K * compute_id; uint32_t C_offset = N * compute_id; - // Unrolling factor of most inner loop. 
- // Should be at least as high as the FMA delay - // for maximum utilization - const uint32_t unroll = 8; - - // SSR strides and bounds only have to be configured - // once in the beginning - if (setup_SSR) { - // First matrix is not stored in transposed format - const uint32_t ssr0_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr0_i[4] = {0, sizeof(float32_t), 0, - sizeof(float32_t) * ldA}; - - // Second matrix is not stored in transposed format - const uint32_t ssr1_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr1_i[4] = {sizeof(float32_t), sizeof(float32_t) * ldB, - sizeof(float32_t) * unroll, 0}; - - snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], - ssr0_i[2], ssr0_i[3]); - - snrt_ssr_repeat(SNRT_SSR_DM0, unroll); - snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], - ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); - } - - // SSR start address need to be configured each time - - snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, &A[A_offset]); - snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, B); - snrt_ssr_enable(); - - // check dimensions and values of a and b - - // Kernel progresses by 1 values each step - // const uint32_t n_frep = K - 1; for (uint32_t m = 0; m < M; m++) { - uint32_t n = 0; - for (uint32_t n0 = 0; n0 < N / unroll; n0++) { - float c[unroll]; - - // Load intermediate result - if (BETA) { - c[0] = C[C_offset + m * ldC + n + 0]; - c[1] = C[C_offset + m * ldC + n + 1]; - c[2] = C[C_offset + m * ldC + n + 2]; - c[3] = C[C_offset + m * ldC + n + 3]; - c[4] = C[C_offset + m * ldC + n + 4]; - c[5] = C[C_offset + m * ldC + n + 5]; - c[6] = C[C_offset + m * ldC + n + 6]; - c[7] = C[C_offset + m * ldC + n + 7]; - } else { - c[0] = 0.0; - c[1] = 0.0; - c[2] = 0.0; - c[3] = 0.0; - c[4] = 0.0; - c[5] = 0.0; - c[6] = 0.0; - c[7] = 0.0; - } - - asm volatile( - "frep.o %[n_frep], 8, 0, 0 \n" - "fmadd.s %[c0], ft0, ft1, %[c0] \n" - "fmadd.s %[c1], ft0, ft1, %[c1] \n" - "fmadd.s %[c2], ft0, ft1, %[c2] \n" - "fmadd.s %[c3], ft0, ft1, %[c3] \n" - "fmadd.s %[c4], ft0, ft1, %[c4] \n" - "fmadd.s %[c5], ft0, ft1, %[c5] \n" - "fmadd.s %[c6], ft0, ft1, %[c6] \n" - "fmadd.s %[c7], ft0, ft1, %[c7] \n" - : [c0] "+f"(c[0]), [c1] "+f"(c[1]), [c2] "+f"(c[2]), [c3] "+f"(c[3]), - [c4] "+f"(c[4]), [c5] "+f"(c[5]), [c6] "+f"(c[6]), [c7] "+f"(c[7]) - : [n_frep] "r"(K - 1) - : "ft0", "ft1", "ft2"); - - // Store results back - Y[C_offset + m * ldC + n + 0] = c[0]; - Y[C_offset + m * ldC + n + 1] = c[1]; - Y[C_offset + m * ldC + n + 2] = c[2]; - Y[C_offset + m * ldC + n + 3] = c[3]; - Y[C_offset + m * ldC + n + 4] = c[4]; - Y[C_offset + m * ldC + n + 5] = c[5]; - Y[C_offset + m * ldC + n + 6] = c[6]; - Y[C_offset + m * ldC + n + 7] = c[7]; - n += unroll; - } - - // Clean up of leftover columns - snrt_ssr_disable(); - for (; n < N; n++) { + for (uint32_t n = 0; n < N; n++) { float32_t c; if (BETA) { c = C[C_offset + m * ldC + n]; } else { - c = 0.0; + c = 0.0f; } for (uint32_t k = 0; k < K; k++) { - c += A[A_offset + k + m * ldA] * B[k * ldB + n]; + c += A[A_offset + m * ldA + k] * B[k * ldB + n]; } Y[C_offset + m * ldC + n] = c; } - snrt_ssr_enable(); } - snrt_ssr_disable(); } diff --git a/TargetLibraries/Snitch/src/HardSwish.c b/TargetLibraries/Snitch/src/HardSwish.c new file mode 100644 index 0000000000..b7e9679c64 --- /dev/null +++ b/TargetLibraries/Snitch/src/HardSwish.c @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +void 
HardSwish_fp32(float32_t *data_in, float32_t *data_out, uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize by dividing work across cores + uint32_t chunk_size = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, end; + if (core_id < remainder) { + chunk_size += 1; + start = core_id * chunk_size; + } else { + start = core_id * chunk_size + remainder; + } + end = start + chunk_size; + + // HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + // Piecewise: + // x <= -3: output = 0 + // -3 < x < 3: output = x * (x/6 + 0.5) + // x >= 3: output = x + + for (uint32_t i = start; i < end; i++) { + float32_t x = data_in[i]; + float32_t clip_val = x / 6.0f + 0.5f; + + // Clamp to [0, 1] + if (clip_val < 0.0f) { + clip_val = 0.0f; + } else if (clip_val > 1.0f) { + clip_val = 1.0f; + } + + data_out[i] = x * clip_val; + } +} diff --git a/TargetLibraries/Snitch/src/Mul_fp32.c b/TargetLibraries/Snitch/src/Mul_fp32.c new file mode 100644 index 0000000000..80d6bc9b33 --- /dev/null +++ b/TargetLibraries/Snitch/src/Mul_fp32.c @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Multiplication (FP32) + * + * Computes: output[i] = input1[i] * input2[i] + * + * Supports ONNX broadcasting rules: + * - If input2 is scalar (size=1): multiplies all elements of input1 by + * input2[0] + * - If both have same size: element-wise multiplication + * + * input1: First input tensor (float32) + * input2: Second input tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise across input1 + */ +void Mul_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize across elements + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + // Element-wise multiplication + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] * input2[i]; + } +} + +/* + * Element-wise Multiplication with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] * scalar + * + * input1: Input tensor (float32) + * scalar: Scalar multiplier (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] 
= input1[i] * scalar; + } +} diff --git a/TargetLibraries/Snitch/src/RMSNrom_fp32.c b/TargetLibraries/Snitch/src/RMSNrom_fp32.c new file mode 100644 index 0000000000..9c615ce923 --- /dev/null +++ b/TargetLibraries/Snitch/src/RMSNrom_fp32.c @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" +#include + +void RMSNorm_fp32(float32_t *data_in, float32_t *weight, float32_t *data_out, + uint32_t size, uint32_t lastDimLength, float32_t eps) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t num_vectors = size / lastDimLength; + + // Parallelize across vectors (batch * sequence dimension) + uint32_t vectors_per_core = num_vectors / numThreads; + uint32_t remainder = num_vectors % numThreads; + + uint32_t start_vec, num_vecs; + if (core_id < remainder) { + num_vecs = vectors_per_core + 1; + start_vec = core_id * num_vecs; + } else { + num_vecs = vectors_per_core; + start_vec = core_id * vectors_per_core + remainder; + } + + for (uint32_t v = start_vec; v < start_vec + num_vecs; v++) { + float32_t *in_ptr = data_in + v * lastDimLength; + float32_t *out_ptr = data_out + v * lastDimLength; + + // Compute sum of squares + float32_t sum_sq = 0.0f; + for (uint32_t i = 0; i < lastDimLength; i++) { + sum_sq += in_ptr[i] * in_ptr[i]; + } + + // Compute RMS with epsilon + float32_t rms = sqrtf(sum_sq / (float32_t)lastDimLength + eps); + float32_t inv_rms = 1.0f / rms; + + // Apply normalization and weight + for (uint32_t i = 0; i < lastDimLength; i++) { + out_ptr[i] = in_ptr[i] * inv_rms * weight[i]; + } + } +}
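
For reference, the RMSNorm_fp32 kernel above normalizes each vector along the last dimension by sqrt(mean(x^2) + eps) and then scales it element-wise by the weight vector. A minimal NumPy sketch of the same computation is given below; it is not part of the patch, and the helper name rmsnorm_fp32_reference is hypothetical. Under those assumptions it can serve to sanity-check the kernel against golden tensors such as the inputs.npz/outputs.npz files under DeeployTest/Tests/Kernels/FP32/RMSNorm_fused.

import numpy as np

def rmsnorm_fp32_reference(data_in: np.ndarray, weight: np.ndarray,
                           eps: float = 1e-6) -> np.ndarray:
    # Mean of squares over the last dimension (lastDimLength in the C kernel)
    mean_sq = np.mean(np.square(data_in.astype(np.float32)), axis=-1, keepdims=True)
    # rms = sqrt(mean(x^2) + eps), matching the kernel's sqrtf(sum_sq / lastDimLength + eps)
    rms = np.sqrt(mean_sq + np.float32(eps))
    # output[i] = (input[i] / rms) * weight[i]
    return (data_in / rms * weight).astype(np.float32)

# Example usage: generate a random activation and an all-ones weight,
# then compare the result against the Snitch kernel's output tile by tile.
x = np.random.randn(4, 128).astype(np.float32)
w = np.ones(128, dtype=np.float32)
y = rmsnorm_fp32_reference(x, w)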