From d2d43db14633237ae0cf83d22098b8c371a82352 Mon Sep 17 00:00:00 2001
From: lee2716
Date: Sat, 31 Jan 2026 12:09:15 +0100
Subject: [PATCH 1/2] Add FP32 operators for MicroLlama on Snitch (untiled)

Add support for the FP32 operators required by the MicroLlama model:
- RMSNorm: Fused RMS normalization
- HardSwish: Activation function
- Div: Element-wise division
- Mul: Element-wise multiplication
- MatMul: Matrix multiplication
- Add: Element-wise addition (FP32 support)
- Reshape, Transpose, Concat, Gather: Shape operations

Components added:
- Generic: Parsers, TypeCheckers, Layers, Bindings
- Snitch Templates: FloatAdd, FloatDiv, FloatHardSwish, FloatMul,
  FloatRMSNorm, FloatMatMul, Reshape, Transpose, Gather
- Snitch Kernels: C implementations for all FP32 operators
- Test data: Hardswish and RMSNorm_fused kernels, microLlama_fp32_1 model

This enables running the MicroLlama FP32 model on Snitch in untiled mode:

  python testRunner_snitch.py -t Tests/Models/microLlama/microLlama_fp32_1
---
 Deeploy/Targets/Generic/Bindings.py | 3 +
 Deeploy/Targets/Generic/Layers.py | 28 ++
 Deeploy/Targets/Generic/Parsers.py | 47 +++-
 Deeploy/Targets/Generic/TypeCheckers.py | 51 ++++
 Deeploy/Targets/Snitch/Bindings.py | 122 +++++++-
 Deeploy/Targets/Snitch/Parsers.py | 264 +++++++++++++++++-
 Deeploy/Targets/Snitch/Platform.py | 151 ++++++++--
 .../Snitch/Templates/FloatAddTemplate.py | 46 +++
 .../Snitch/Templates/FloatDivTemplate.py | 49 ++++
 .../Templates/FloatHardSwishTemplate.py | 30 ++
 .../Snitch/Templates/FloatMatMulTemplate.py | 30 ++
 .../Snitch/Templates/FloatMulTemplate.py | 48 ++++
 .../Snitch/Templates/FloatRMSNormTemplate.py | 31 ++
 .../Snitch/Templates/FloatSoftmaxTemplate.py | 4 +-
 .../Snitch/Templates/GatherTemplate.py | 18 ++
 .../Snitch/Templates/MatMulTemplate.py | 58 ++++
 .../Snitch/Templates/ReshapeTemplate.py | 44 +++
 .../Snitch/Templates/TransposeTemplate.py | 36 +++
 .../Tests/Kernels/FP32/Hardswish/inputs.npz | Bin 0 -> 1288 bytes
 .../Tests/Kernels/FP32/Hardswish/network.onnx | 14 +
 .../Tests/Kernels/FP32/Hardswish/outputs.npz | Bin 0 -> 1290 bytes
 .../Kernels/FP32/RMSNorm_fused/inputs.npz | Bin 0 -> 2312 bytes
 .../Kernels/FP32/RMSNorm_fused/network.onnx | Bin 0 -> 220 bytes
 .../Kernels/FP32/RMSNorm_fused/outputs.npz | Bin 0 -> 2314 bytes
 .../microLlama_fp32_1/activations.npz | Bin 0 -> 355738 bytes
 .../microLlama/microLlama_fp32_1/inputs.npz | Bin 0 -> 9138 bytes
 .../microLlama/microLlama_fp32_1/network.onnx | Bin 0 -> 2186293 bytes
 .../microLlama/microLlama_fp32_1/outputs.npz | Bin 0 -> 526 bytes
 DeeployTest/testUtils/typeMapping.py | 6 +-
 TargetLibraries/Generic/inc/macros.h | 6 +
 .../Snitch/inc/DeeploySnitchMath.h | 5 +
 TargetLibraries/Snitch/inc/kernel/Add.h | 21 ++
 TargetLibraries/Snitch/inc/kernel/Div.h | 44 +++
 TargetLibraries/Snitch/inc/kernel/HardSwish.h | 34 +++
 TargetLibraries/Snitch/inc/kernel/Mul.h | 44 +++
 TargetLibraries/Snitch/inc/kernel/RMSNrom.h | 31 ++
 TargetLibraries/Snitch/inc/kernel/Softmax.h | 2 +-
 TargetLibraries/Snitch/inc/macros.h | 9 +
 TargetLibraries/Snitch/src/Add_fp32.c | 102 +++++++
 TargetLibraries/Snitch/src/CycleCounter.c | 19 +-
 TargetLibraries/Snitch/src/Div_fp32.c | 89 ++++++
 TargetLibraries/Snitch/src/Gemm_fp32.c | 199 +------------
 TargetLibraries/Snitch/src/HardSwish.c | 46 +++
 TargetLibraries/Snitch/src/Mul_fp32.c | 86 ++++++
 TargetLibraries/Snitch/src/RMSNrom_fp32.c | 50 ++++
 45 files changed, 1643 insertions(+), 224 deletions(-)
 create mode 100644 Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py
 create mode 100644 
Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py create mode 100644 Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py create mode 100644 Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py create mode 100644 Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py create mode 100644 Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py create mode 100644 Deeploy/Targets/Snitch/Templates/GatherTemplate.py create mode 100644 Deeploy/Targets/Snitch/Templates/MatMulTemplate.py create mode 100644 Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py create mode 100644 Deeploy/Targets/Snitch/Templates/TransposeTemplate.py create mode 100644 DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz create mode 100644 DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx create mode 100644 DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz create mode 100644 DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/inputs.npz create mode 100644 DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/network.onnx create mode 100644 DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/outputs.npz create mode 100644 DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/activations.npz create mode 100644 DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/inputs.npz create mode 100644 DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/network.onnx create mode 100644 DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/outputs.npz create mode 100644 TargetLibraries/Snitch/inc/kernel/Add.h create mode 100644 TargetLibraries/Snitch/inc/kernel/Div.h create mode 100644 TargetLibraries/Snitch/inc/kernel/HardSwish.h create mode 100644 TargetLibraries/Snitch/inc/kernel/Mul.h create mode 100644 TargetLibraries/Snitch/inc/kernel/RMSNrom.h create mode 100644 TargetLibraries/Snitch/src/Add_fp32.c create mode 100644 TargetLibraries/Snitch/src/Div_fp32.c create mode 100644 TargetLibraries/Snitch/src/HardSwish.c create mode 100644 TargetLibraries/Snitch/src/Mul_fp32.c create mode 100644 TargetLibraries/Snitch/src/RMSNrom_fp32.c diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index ec2ed6270f..221a797dab 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -283,6 +283,9 @@ BasicConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, BasicTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, BasicTransformer) ] BasicQuantBindings = [ diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..26dd5746c9 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -709,3 +709,31 @@ def computeOps(self): numPx = opRep['dim_im_out_x'] return numPx * opsPerPx + + +class RMSNormLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # RMSNorm: square, mean, sqrt, div, mul + size = self.mapper.parser.operatorRepresentation['size'] + lastDimLength = self.mapper.parser.operatorRepresentation['lastDimLength'] + batch_size = size // lastDimLength + + # square + sum + mean + eps + sqrt + div + mul + ops = size + batch_size * lastDimLength + batch_size * 4 + size * 2 + return ops + + +class HardSwishLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): 
+ # HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + # Operations: div + add + clip + mul + size = self.mapper.parser.operatorRepresentation['size'] + return size * 4 diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index cf1ba776bd..f0abefd4f6 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -467,23 +467,62 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) - return ret def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - data_in_1 = ctxt.lookup(node.inputs[0].name) data_in_2 = ctxt.lookup(node.inputs[1].name) data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in_1'] = data_in_1.name self.operatorRepresentation['data_in_2'] = data_in_2.name self.operatorRepresentation['data_out'] = data_out.name - self.operatorRepresentation['size'] = np.prod(data_in_1.shape) + self.operatorRepresentation['size'] = np.prod(data_out.shape) + + # Check if broadcasting is needed + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + need_broadcast = (shape1 != out_shape) or (shape2 != out_shape) + self.operatorRepresentation['need_broadcast'] = need_broadcast + + if need_broadcast: + # Calculate strides for broadcasting + ndim = len(out_shape) + + # Compute strides for input 1 + strides1 = [1] * ndim + for i in range(ndim - 1, -1, -1): + if i < len(shape1) and shape1[i] == out_shape[i]: + if i == ndim - 1: + strides1[i] = 1 + else: + strides1[i] = strides1[i + 1] * shape1[i + 1] if ( + i + 1 < len(shape1) and shape1[i + 1] == out_shape[i + 1]) else strides1[i + 1] + else: + strides1[i] = 0 # Broadcast dimension + + # Compute strides for input 2 + strides2 = [1] * ndim + for i in range(ndim - 1, -1, -1): + if i < len(shape2) and shape2[i] == out_shape[i]: + if i == ndim - 1: + strides2[i] = 1 + else: + strides2[i] = strides2[i + 1] * shape2[i + 1] if ( + i + 1 < len(shape2) and shape2[i + 1] == out_shape[i + 1]) else strides2[i + 1] + else: + strides2[i] = 0 # Broadcast dimension + + self.operatorRepresentation['ndim'] = ndim + self.operatorRepresentation['strides1'] = strides1 + self.operatorRepresentation['strides2'] = strides2 + self.operatorRepresentation['out_shape'] = out_shape return ctxt, True diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..6b3ff546b3 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -102,6 +102,20 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [False] +class FloatAddChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [True] + + class GatherChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): @@ -610,3 +624,40 @@ def _inferNumLevels(self, inputs: List[VariableBuffer], def _inferSignedness(self, inputs: List[VariableBuffer], 
operatorRepresentation: OperatorRepresentation) -> List[bool]: return [True] + + +class RMSNormChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + # RMSNorm: square, mean, sqrt, reciprocal, multiply + # Output precision similar to input + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + # RMSNorm output can be signed (depending on input signedness) + if inputs[0]._signed: + return [True] + else: + return [False] + + +class HardSwishChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + if inputs[0]._signed: + return [True] + else: + return [False] diff --git a/Deeploy/Targets/Snitch/Bindings.py b/Deeploy/Targets/Snitch/Bindings.py index 25b150b553..e442f63038 100644 --- a/Deeploy/Targets/Snitch/Bindings.py +++ b/Deeploy/Targets/Snitch/Bindings.py @@ -11,12 +11,20 @@ from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import iNoNormTemplate -from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker +from Deeploy.Targets.Generic.Templates import ConcatTemplate, iNoNormTemplate +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, DivChecker, GatherChecker, GEMMChecker, \ + HardSwishChecker, MatMulChecker, MulChecker, ReshapeChecker, RMSNormChecker, RQAddChecker, SoftmaxChecker, \ + TransposeChecker, iNoNormChecker from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \ SnitchSynchCoresPass from Deeploy.Targets.Snitch.DMA.SnitchDma import SnitchDma -from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate +from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, FloatMatMulTemplate, GatherTemplate, \ + MatMulTemplate, ReshapeTemplate, RQAddTemplate, TransposeTemplate, iSoftmaxTemplate +from Deeploy.Targets.Snitch.Templates.FloatAddTemplate import referenceTemplate as FloatAddTemplate +from Deeploy.Targets.Snitch.Templates.FloatDivTemplate import referenceTemplate as FloatDivTemplate +from Deeploy.Targets.Snitch.Templates.FloatHardSwishTemplate import referenceTemplate as FloatHardSwishTemplate +from Deeploy.Targets.Snitch.Templates.FloatMulTemplate import referenceTemplate as FloatMulTemplate +from Deeploy.Targets.Snitch.Templates.FloatRMSNormTemplate import referenceTemplate as FloatRMSNormTemplate from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template from Deeploy.Targets.Snitch.Templates.GemmTemplate import 
SnitchGemm_Template from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template @@ -45,6 +53,7 @@ ArgumentStructGeneration(), MemoryManagementGeneration("L1"), MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + MemoryManagementGeneration("L2"), MemoryManagementGeneration() ]) @@ -69,7 +78,18 @@ SnitchAddBindings = [ NodeBinding(AddChecker([PointerClass(_type), PointerClass(_type)], [PointerClass(int32_t)]), AddTemplate.referenceTemplate, TiledTransformer) for _type in [int8_t] +] + [ + # fp32 support + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate, TiledTransformer) +] + +# Basic (non-tiled) FP32 Add Bindings +BasicAddBindings = [ + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate, BasicTransformer) ] + SnitchGemmBindings = [ NodeBinding( GEMMChecker([PointerClass(int8_t), PointerClass(int8_t), @@ -90,3 +110,99 @@ PointerClass(int32_t) ], [PointerClass(int8_t)]), SnitchRqGemm_Template, TiledTransformer) ] + +# RMSNorm Bindings (Tiled) +SnitchRMSNormBindings = [ + NodeBinding(RMSNormChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatRMSNormTemplate, TiledTransformer) +] + +# RMSNorm Bindings (Non-tiled) +BasicRMSNormBindings = [ + NodeBinding(RMSNormChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatRMSNormTemplate, BasicTransformer) +] + +# HardSwish Bindings (Tiled) +SnitchHardSwishBindings = [ + NodeBinding(HardSwishChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatHardSwishTemplate, + TiledTransformer) +] + +# HardSwish Bindings (Non-tiled) +BasicHardSwishBindings = [ + NodeBinding(HardSwishChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatHardSwishTemplate, + BasicTransformer) +] + +# Div Bindings (Tiled) +SnitchDivBindings = [ + NodeBinding(DivChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatDivTemplate, TiledTransformer) +] + +# Div Bindings (Non-tiled) +BasicDivBindings = [ + NodeBinding(DivChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatDivTemplate, BasicTransformer) +] + +# Mul Bindings (Tiled) +SnitchMulBindings = [ + NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMulTemplate, TiledTransformer) +] + +# Mul Bindings (Non-tiled) +BasicMulBindings = [ + NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMulTemplate, BasicTransformer) +] + +# MatMul Bindings (Tiled) +SnitchMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + MatMulTemplate.referenceTemplate, TiledTransformer), + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMatMulTemplate.referenceTemplate, TiledTransformer) +] + +# Concat Bindings (Tiled) +SnitchConcatBindings = [ + NodeBinding(ConcatChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int8_t)]), + ConcatTemplate.referenceTemplate, TiledTransformer), + NodeBinding(ConcatChecker([PointerClass(int32_t), PointerClass(int32_t)], [PointerClass(int32_t)]), + ConcatTemplate.referenceTemplate, TiledTransformer), + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], 
[PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, TiledTransformer) +] + +# Transpose Bindings (Tiled) +SnitchTransposeBindings = [ + NodeBinding(TransposeChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), TransposeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(TransposeChecker([PointerClass(int32_t)], [PointerClass(int32_t)]), TransposeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(TransposeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + TransposeTemplate.referenceTemplate, TiledTransformer) +] + +# Reshape Bindings (Tiled) +SnitchReshapeBindings = [ + NodeBinding(ReshapeChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), ReshapeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(ReshapeChecker([PointerClass(int32_t)], [PointerClass(int32_t)]), ReshapeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(ReshapeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), ReshapeTemplate.referenceTemplate, + TiledTransformer) +] + +# Gather Bindings (Tiled) +SnitchGatherBindings = [ + NodeBinding(GatherChecker([PointerClass(int8_t), PointerClass(int32_t)], [PointerClass(int8_t)]), + GatherTemplate.referenceTemplate, TiledTransformer), + NodeBinding(GatherChecker([PointerClass(int32_t), PointerClass(int32_t)], [PointerClass(int32_t)]), + GatherTemplate.referenceTemplate, TiledTransformer), + NodeBinding(GatherChecker([PointerClass(float32_t), PointerClass(int32_t)], [PointerClass(float32_t)]), + GatherTemplate.referenceTemplate, TiledTransformer) +] diff --git a/Deeploy/Targets/Snitch/Parsers.py b/Deeploy/Targets/Snitch/Parsers.py index 0051994686..6976d8d356 100644 --- a/Deeploy/Targets/Snitch/Parsers.py +++ b/Deeploy/Targets/Snitch/Parsers.py @@ -4,10 +4,11 @@ from typing import Tuple +import numpy as np import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import NetworkContext -from Deeploy.Targets.Generic.Parsers import GEMMParser, RQGEMMParser +from Deeploy.DeeployTypes import NetworkContext, NodeParser +from Deeploy.Targets.Generic.Parsers import AddParser, DivParser, GEMMParser, MulParser, RQGEMMParser class SnitchGEMMParser(GEMMParser): @@ -72,3 +73,262 @@ def parseNodeCtxt(self, return ctxt, False return newCtxt, True + + +class SnitchRMSNormParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + if node.op != 'RMSNorm': + return False + if len(node.inputs) != 2 or len(node.outputs) != 1: + return False + eps = node.attrs.get('eps', node.attrs.get('epsilon', 1e-6)) + self.operatorRepresentation['eps'] = f"{float(eps):.10e}f" + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + data_in = ctxt.lookup(node.inputs[0].name) + weight = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['weight'] = weight.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['input_shape'] = list(data_in.shape) + self.operatorRepresentation['weight_shape'] = list(weight.shape) + self.operatorRepresentation['output_shape'] = list(data_out.shape) + self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + self.operatorRepresentation['lastDimLength'] = int(data_in.shape[-1]) + self.operatorRepresentation['input_ndim'] = len(data_in.shape) + self.operatorRepresentation['weight_ndim'] = 
len(weight.shape) + + return ctxt, True + + +class HardSwishParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + """Parse HardSwish node.""" + + if node.op != 'HardSwish': + return False + + # Check basic structure: 1 input and 1 output + if len(node.inputs) != 1 or len(node.outputs) != 1: + return False + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """Parse HardSwish node with network context.""" + + # Get input and output buffers + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + + # Store buffer names + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + + # Calculate size for memory allocation + self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + + return ctxt, True + + +class SnitchAddParser(AddParser): + """ + Inherits from GenericAddParser and adds support for Broadcasting. + + Compatibility: + - No broadcasting: Uses the Add_fp32() fast path. + - With broadcasting: Uses the Add_fp32_broadcast() generic version. + """ + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Call parent method to retrieve basic information + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return ctxt, False + + # Retrieve shape information + data_in_1 = ctxt.lookup(node.inputs[0].name) + data_in_2 = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + # Correct 'size' to match the output shape (after broadcasting) + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + + # Broadcasting information + self.operatorRepresentation['shape1'] = shape1 + self.operatorRepresentation['shape2'] = shape2 + self.operatorRepresentation['out_shape'] = out_shape + self.operatorRepresentation['ndim'] = len(out_shape) + + # Determine if broadcasting is needed + need_broadcast = (shape1 != shape2) + self.operatorRepresentation['need_broadcast'] = need_broadcast + + if need_broadcast: + strides1, strides2 = self._compute_broadcast_strides(shape1, shape2, out_shape) + self.operatorRepresentation['strides1'] = strides1 + self.operatorRepresentation['strides2'] = strides2 + + return ctxt, True + + def _compute_broadcast_strides(self, shape1, shape2, out_shape): + """ + Calculates strides after broadcasting (following ONNX/NumPy rules). + + Principles: + - Align dimensions from right to left. + - When a dimension is 1, set stride to 0 to achieve the broadcasting effect. 
+ + Example: + shape1=[8,8,8], shape2=[8] + → pad2=[1,1,8] + → strides1=[64,8,1], strides2=[0,0,1] + """ + ndim = len(out_shape) + + # Right-align and pad to the same number of dimensions + pad1 = [1] * (ndim - len(shape1)) + shape1 + pad2 = [1] * (ndim - len(shape2)) + shape2 + + def calc_strides(padded_shape, out_shape): + strides = [] + stride = 1 + for i in range(ndim - 1, -1, -1): + if padded_shape[i] == 1 and out_shape[i] > 1: + strides.insert(0, 0) # Broadcast dimension stride=0 + else: + strides.insert(0, stride) + + # Update stride multiplier only if the current dimension is essentially used (size > 1) + stride *= padded_shape[i] if padded_shape[i] > 1 else 1 + return strides + + strides1 = calc_strides(pad1, out_shape) + strides2 = calc_strides(pad2, out_shape) + + return strides1, strides2 + + +class SnitchDivParser(DivParser): + """ + Snitch-specific Div Parser. + Inherits from Generic DivParser and adds shape/broadcasting information. + """ + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """ + Extend Generic parser to add shape and broadcasting information. + """ + # Call parent method first + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + # Get shape information + data_in_1 = ctxt.lookup(node.inputs[0].name) + data_in_2 = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + # Store shape information + self.operatorRepresentation['shape1'] = shape1 + self.operatorRepresentation['shape2'] = shape2 + self.operatorRepresentation['out_shape'] = out_shape + + # Calculate sizes + self.operatorRepresentation['size1'] = int(np.prod(shape1)) + self.operatorRepresentation['size2'] = int(np.prod(shape2)) + + # Update output size (may differ due to broadcasting) + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + + # Check if scalar broadcasting (input2 is scalar) + self.operatorRepresentation['is_scalar'] = (self.operatorRepresentation['size2'] == 1) + + return ctxt, True + + +class SnitchMulParser(MulParser): + """ + Snitch-specific Mul Parser. + Inherits from Generic MulParser and adds shape/broadcasting information. + """ + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """ + Extend Generic parser to add shape and broadcasting information. 
+ """ + # Call parent method first + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + # Get shape information + data_in_1 = ctxt.lookup(node.inputs[0].name) + data_in_2 = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + # Store shape information + self.operatorRepresentation['shape1'] = shape1 + self.operatorRepresentation['shape2'] = shape2 + self.operatorRepresentation['out_shape'] = out_shape + + # Calculate sizes + self.operatorRepresentation['size1'] = int(np.prod(shape1)) + self.operatorRepresentation['size2'] = int(np.prod(shape2)) + + # Update output size (may differ due to broadcasting) + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + + # Check if scalar broadcasting (input2 is scalar) + self.operatorRepresentation['is_scalar'] = (self.operatorRepresentation['size2'] == 1) + + return ctxt, True diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py index d62d1c3802..32bf53190f 100644 --- a/Deeploy/Targets/Snitch/Platform.py +++ b/Deeploy/Targets/Snitch/Platform.py @@ -2,46 +2,69 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List +from typing import List, Type import numpy as np +from Deeploy.AbstractDataTypes import Pointer, PointerClass, VoidType from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer -from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicLayerNormBindings, BasicMatMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding -from Deeploy.Targets.Generic.Layers import AddLayer, GatherLayer, GEMMLayer, LayerNormLayer, MatMulLayer, PadLayer, \ - ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer, iNoNormLayer -from Deeploy.Targets.Generic.Parsers import AddParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \ - RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser +from Deeploy.Targets.Generic.Bindings import BasicConcatBindings, BasicGatherBindings, BasicLayerNormBindings, \ + BasicMatMulBindings, BasicPad1DBindings, BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding, \ + BasicTransposeBindings +from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, DivLayer, GatherLayer, GEMMLayer, HardSwishLayer, \ + LayerNormLayer, MatMulLayer, MulLayer, PadLayer, ReshapeLayer, RMSNormLayer, RQGEMMLayer, RQIntegerDivLayer, \ + SoftmaxLayer, TransposeLayer, iNoNormLayer +from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \ + ReshapeParser, RQAddParser, RQIntegerDivParser, SoftmaxParser, TransposeParser, UnsqueezeParser, iLayerNormParser, \ + iNoNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import AddRequantMergePass, GEMMRequantMergePass, \ IntegerDivRequantMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.PULPOpen.Platform import RQAddMapper -from Deeploy.Targets.Snitch.Parsers import 
SnitchGEMMParser, SnitchRQGEMMParser +from Deeploy.Targets.Snitch.Bindings import BasicDivBindings, BasicHardSwishBindings, BasicMulBindings, \ + BasicRMSNormBindings, SnitchAddBindings, SnitchGemmBindings, SnitchiNoNormBindings, SnitchiSoftmaxBindings, \ + SnitchRQAddBindings, SnitchRqGemmBindings +from Deeploy.Targets.Snitch.Parsers import HardSwishParser, SnitchDivParser, SnitchGEMMParser, SnitchMulParser, \ + SnitchRMSNormParser, SnitchRQGEMMParser from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate -from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchGemmTilingReadyBindings, \ - SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, SnitchRQAddTilingReadyBindings, \ - SnitchRqGemmTilingReadyBindings +# ============================================================================= +# Mappers for UNTILED mode (using BasicBindings with BasicTransformer) +# These are used by generateNetwork.py (testRunner_snitch.py) +# ============================================================================= GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings) +ReshapeMapper = NodeMapper(ReshapeParser(), BasicReshapeBindings) +TransposeMapper = NodeMapper(TransposeParser(), BasicTransposeBindings) +ConcatMapper = NodeMapper(ConcatParser(), BasicConcatBindings) RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) -MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) -GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings) -RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings) -iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) -SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) -iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings) +# These use TiledTransformer but work in both modes (original upstream behavior) +GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmBindings) +RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmBindings) +iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxBindings) +SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxBindings) +iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormBindings) iLayerNormMapper = NodeMapper(iLayerNormParser(), BasicLayerNormBindings) -RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings) -AddMapper = NodeMapper(AddParser(), SnitchAddTileReadyBindings) +RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddBindings) +AddMapper = NodeMapper(AddParser(), SnitchAddBindings) + +# New operators for microLlama - using BasicBindings for untiled mode +RMSNormMapper = NodeMapper(SnitchRMSNormParser(), BasicRMSNormBindings) +HardSwishMapper = NodeMapper(HardSwishParser(), BasicHardSwishBindings) +MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) +DivMapper = NodeMapper(SnitchDivParser(), BasicDivBindings) +MulMapper = NodeMapper(SnitchMulParser(), BasicMulBindings) +# ============================================================================= +# SnitchMapping - for UNTILED mode (generateNetwork.py) +# Uses BasicBindings for new operators, TiledTransformer bindings for original ops +# ============================================================================= SnitchMapping = 
{ 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), 'Gather': GatherLayer([GatherMapper]), @@ -56,6 +79,72 @@ 'iLayerNorm': LayerNormLayer([iLayerNormMapper]), 'RequantizedAdd': AddLayer([RQAddMapper]), 'Add': AddLayer([AddMapper]), + 'RMSNorm': RMSNormLayer([RMSNormMapper]), + 'HardSwish': HardSwishLayer([HardSwishMapper]), + 'Div': DivLayer([DivMapper]), + 'Mul': MulLayer([MulMapper]), + 'Reshape': ReshapeLayer([ReshapeMapper]), + 'Transpose': TransposeLayer([TransposeMapper]), + 'Concat': ConcatLayer([ConcatMapper]), +} + +# ============================================================================= +# Import TilingReadyBindings for TILED mode (testMVP.py) +# These will be used by TilerDeployerWrapper +# ============================================================================= +from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchConcatTilingReadyBindings, \ + SnitchDivTilingReadyBindings, SnitchGatherTilingReadyBindings, SnitchGemmTilingReadyBindings, \ + SnitchHardSwishTilingReadyBindings, SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, \ + SnitchMatMulTilingReadyBindings, SnitchMulTilingReadyBindings, SnitchReshapeTilingReadyBindings, \ + SnitchRMSNormTilingReadyBindings, SnitchRQAddTilingReadyBindings, SnitchRqGemmTilingReadyBindings, \ + SnitchTransposeTilingReadyBindings + +# ============================================================================= +# Tiled Mappers - for TILED mode (testMVP.py via TilerDeployerWrapper) +# ============================================================================= +TiledGatherMapper = NodeMapper(GatherParser(), SnitchGatherTilingReadyBindings) +TiledUnsqueezeMapper = NodeMapper(UnsqueezeParser(), SnitchReshapeTilingReadyBindings) +TiledReshapeMapper = NodeMapper(ReshapeParser(), SnitchReshapeTilingReadyBindings) +TiledTransposeMapper = NodeMapper(TransposeParser(), SnitchTransposeTilingReadyBindings) +TiledConcatMapper = NodeMapper(ConcatParser(), SnitchConcatTilingReadyBindings) +TiledMatMulMapper = NodeMapper(MatMulParser(), SnitchMatMulTilingReadyBindings) +TiledRMSNormMapper = NodeMapper(SnitchRMSNormParser(), SnitchRMSNormTilingReadyBindings) +TiledHardSwishMapper = NodeMapper(HardSwishParser(), SnitchHardSwishTilingReadyBindings) +TiledDivMapper = NodeMapper(SnitchDivParser(), SnitchDivTilingReadyBindings) +TiledMulMapper = NodeMapper(SnitchMulParser(), SnitchMulTilingReadyBindings) +TiledGemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings) +TiledRqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings) +TilediSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) +TiledSoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) +TilediNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings) +TiledRQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings) +TiledAddMapper = NodeMapper(AddParser(), SnitchAddTileReadyBindings) + +# ============================================================================= +# SnitchTiledMapping - for TILED mode (testMVP.py) +# Uses TilingReadyBindings for all operators +# ============================================================================= +SnitchTiledMapping = { + 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), + 'Gather': GatherLayer([TiledGatherMapper]), + 'Pad': PadLayer([Pad1DMapper, Pad2DMapper]), + 'Unsqueeze': ReshapeLayer([TiledUnsqueezeMapper]), + 'MatMul': MatMulLayer([TiledMatMulMapper]), + 'Gemm': 
GEMMLayer([TiledGemmMapper]), + 'RQGemm': RQGEMMLayer([TiledRqGemmMapper]), + 'iSoftmax': SoftmaxLayer([TilediSoftmaxMapper]), + 'Softmax': SoftmaxLayer([TiledSoftmaxMapper]), + 'iNoNorm': iNoNormLayer([TilediNoNormMapper]), + 'iLayerNorm': LayerNormLayer([iLayerNormMapper]), + 'RequantizedAdd': AddLayer([TiledRQAddMapper]), + 'Add': AddLayer([TiledAddMapper]), + 'RMSNorm': RMSNormLayer([TiledRMSNormMapper]), + 'HardSwish': HardSwishLayer([TiledHardSwishMapper]), + 'Div': DivLayer([TiledDivMapper]), + 'Mul': MulLayer([TiledMulMapper]), + 'Reshape': ReshapeLayer([TiledReshapeMapper]), + 'Transpose': TransposeLayer([TiledTransposeMapper]), + 'Concat': ConcatLayer([TiledConcatMapper]), } @@ -105,6 +194,12 @@ class SnitchConstantBuffer(ConstantBuffer): allocTemplate = AllocateTemplate.snitchL2GlobalAllocateTemplate deallocTemplate = FreeTemplate.snitchL2GlobalTemplate + def __init__(self, name: str = '', shape = [1], values = [0]): + super().__init__(name, shape, values) + # Initialize _type with a default value to prevent AttributeError + # The actual type will be set later via annotateType + self._type: Type[Pointer] = PointerClass(VoidType) + def _bufferRepresentation(self): operatorRepresentation = super()._bufferRepresentation() @@ -163,3 +258,21 @@ def __init__(self, transientBuffer = SnitchTransientBuffer, includeList: List[str] = _includeList): super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + +class SnitchTiledClusterEngine(DeploymentEngine): + + def __init__(self, name: str, Mapping = SnitchTiledMapping, initCode = "", includeList = _includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) + + +class SnitchTiledPlatform(DeploymentPlatform): + + def __init__(self, + engines = [SnitchTiledClusterEngine("SnitchCluster")], + variableBuffer = SnitchVariableBuffer, + constantBuffer = SnitchConstantBuffer, + structBuffer = SnitchStructBuffer, + transientBuffer = SnitchTransientBuffer, + includeList: List[str] = _includeList): + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) diff --git a/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py new file mode 100644 index 0000000000..5c5675c58f --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +# Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatAddTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Always initialize these variables to avoid Mako errors + operatorRepresentation.setdefault('need_broadcast', False) + operatorRepresentation.setdefault('ndim', 0) + operatorRepresentation.setdefault('strides1_str', '{}') + operatorRepresentation.setdefault('strides2_str', '{}') + operatorRepresentation.setdefault('out_shape_str', '{}') + + # If broadcasting is required, generate the stride array strings + if operatorRepresentation['need_broadcast']: + strides1 = operatorRepresentation['strides1'] + strides2 = operatorRepresentation['strides2'] + out_shape = operatorRepresentation['out_shape'] + operatorRepresentation['strides1_str'] = '{' + ', '.join(map(str, strides1)) + '}' + 
operatorRepresentation['strides2_str'] = '{' + ', '.join(map(str, strides2)) + '}' + operatorRepresentation['out_shape_str'] = '{' + ', '.join(map(str, out_shape)) + '}' + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _FloatAddTemplate(""" +// Snitch FP32 Add (Name: ${nodeName}, Op: ${nodeOp}) +% if need_broadcast: +{ + uint32_t strides1[${ndim}] = ${strides1_str}; + uint32_t strides2[${ndim}] = ${strides2_str}; + uint32_t out_shape[${ndim}] = ${out_shape_str}; + Add_fp32_broadcast(${data_in_1}, ${data_in_2}, ${data_out}, out_shape, strides1, strides2, ${ndim}, ${size}); +} +% else: +Add_fp32(${data_in_1}, ${data_in_2}, ${data_out}, ${size}); +% endif +""") diff --git a/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py new file mode 100644 index 0000000000..ee35255e24 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py @@ -0,0 +1,49 @@ +# ~/Deeploy/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py + +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatDivTemplate(NodeTemplate): + """Template for FP32 Div operation with dynamic template selection.""" + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # Check if scalar broadcasting + is_scalar = operatorRepresentation.get('is_scalar', False) + + # Dynamically select template based on is_scalar flag + if is_scalar: + # Use scalar broadcasting version + self.templateStr = FloatDivScalarTemplateStr + else: + # Use element-wise version + self.templateStr = FloatDivTemplateStr + + return ctxt, operatorRepresentation, [] + + +# Template for element-wise division +FloatDivTemplateStr = r""" +Div_fp32(${input1}, ${input2}, ${output}, ${size}); +""" + +# Template for scalar broadcasting (optimized) +FloatDivScalarTemplateStr = r""" +{ + float32_t scalar = ${input2}[0]; + Div_fp32_scalar(${input1}, scalar, ${output}, ${size}); +} +""" + +# Create reference template with default (element-wise) +referenceTemplate = FloatDivTemplate(FloatDivTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py new file mode 100644 index 0000000000..1615282437 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatHardSwishTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation["data_in"]) + operatorRepresentation["size"] = int(np.prod(data_in.shape)) + + return ctxt, operatorRepresentation, [] + + +FloatHardSwishTemplateStr = r""" +HardSwish_fp32(${data_in}, ${data_out}, ${size}); +""" + +referenceTemplate = FloatHardSwishTemplate(FloatHardSwishTemplateStr) diff --git 
a/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py new file mode 100644 index 0000000000..0cd0a649e1 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = NodeTemplate(""" +// Matmul (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { + ${A_type.typeName} ref_${data_out}_${A} = ${A}; + ${B_type.typeName} ref_${data_out}_${B} = ${B}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for(uint32_t i=0; i<${batch}; i++){ + MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; + } +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py new file mode 100644 index 0000000000..7a970e6411 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatMulTemplate(NodeTemplate): + """Template for FP32 Mul operation with dynamic template selection.""" + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # Check if scalar broadcasting + is_scalar = operatorRepresentation.get('is_scalar', False) + + # Dynamically select template based on is_scalar flag + if is_scalar: + # Use scalar broadcasting version + self.templateStr = FloatMulScalarTemplateStr + else: + # Use element-wise version + self.templateStr = FloatMulTemplateStr + + return ctxt, operatorRepresentation, [] + + +# Template for element-wise multiplication +# Note: MulParser uses A, B, C for input1, input2, output respectively +FloatMulTemplateStr = r""" +Mul_fp32(${A}, ${B}, ${C}, ${size}); +""" + +# Template for scalar broadcasting (optimized) +FloatMulScalarTemplateStr = r""" +{ + float32_t scalar = ${B}[0]; + Mul_fp32_scalar(${A}, scalar, ${C}, ${size}); +} +""" + +# Create reference template with default (element-wise) +referenceTemplate = FloatMulTemplate(FloatMulTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py new file mode 100644 index 0000000000..8ae4d95e01 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatRMSNormTemplate(NodeTemplate): + + def __init__(self, templateStr): + 
super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation["data_in"]) + operatorRepresentation["lastDimLength"] = data_in.shape[-1] + operatorRepresentation["size"] = int(np.prod(data_in.shape)) + + return ctxt, operatorRepresentation, [] + + +FloatRMSNormTemplateStr = r""" +RMSNorm_fp32(${data_in}, ${weight}, ${data_out}, ${size}, ${lastDimLength}, ${eps}); +""" + +referenceTemplate = FloatRMSNormTemplate(FloatRMSNormTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py index 216ff35b9a..a8f32b32e3 100644 --- a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py @@ -25,8 +25,8 @@ def alignToContext(self, ctxt: NetworkContext, FloatSoftmaxTemplateStr = r""" - uint32_t batch_size = ${size} / ${lastDimLength}; - uint32_t compute_num = 1; //snrt_cluster_compute_core_num(); + int32_t batch_size = ${size} / ${lastDimLength}; + int32_t compute_num = 1; //snrt_cluster_compute_core_num(); int32_t ldI = compute_num * ${input_samples}; int32_t batch_offset = ${seq_len} * ${input_samples}; diff --git a/Deeploy/Targets/Snitch/Templates/GatherTemplate.py b/Deeploy/Targets/Snitch/Templates/GatherTemplate.py new file mode 100644 index 0000000000..fa4f6a2a86 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/GatherTemplate.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = NodeTemplate(""" +// Gather (Name: ${nodeName}, Op: ${nodeOp}) +<% +width = int(data_in_type.referencedType.typeWidth/8) +%> +if (snrt_cluster_core_idx() == 0) { +for (uint32_t i=0; i<${batch}; ++i) { + memcpy(${data_out} + i * ${axis_length}, ${data_in} + i * ${batch_length} + ${index} * ${axis_length}, ${axis_length} * ${width}); +} +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/MatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/MatMulTemplate.py new file mode 100644 index 0000000000..bce916ea60 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/MatMulTemplate.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _MatMulTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + C = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['A_offset'] = 0 + operatorRepresentation['B_offset'] = 0 + operatorRepresentation['C_offset'] = 0 + if hasattr(A, "_signed") and hasattr(A, "nLevels"): + operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2) + if hasattr(B, "_signed") and hasattr(B, "nLevels"): + operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2) + if hasattr(C, "_signed") and hasattr(C, "nLevels"): + 
operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = _MatMulTemplate(""" +// MatMul (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { + ${A_type.typeName} ref_${data_out}_${A} = ${A}; + ${B_type.typeName} ref_${data_out}_${B} = ${B}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for(uint32_t i=0;i<${batch};i++){ + MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ${A_offset}, ${B_offset}, ${C_offset} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; + } +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py b/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py new file mode 100644 index 0000000000..a99573b27b --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer + + +class _SnitchReshapeTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # SCHEREMO: Selectively mark 'indices' dead, since we don't need them + if 'indices' in operatorRepresentation.keys(): + ctxt.globalObjects[operatorRepresentation['indices']]._deploy = False + ctxt.globalObjects[operatorRepresentation['indices']]._live = False + + # Same for "shape" + if "shape" in operatorRepresentation.keys(): + ctxt.globalObjects[operatorRepresentation["shape"]]._deploy = False + ctxt.globalObjects[operatorRepresentation["shape"]]._live = False + + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # Link aliases to each buffer + bufferIn.aliases.add(bufferOut.name) + bufferOut.aliases.add(bufferIn.name) + + return ctxt, operatorRepresentation, [] + + +# Use snrt_cluster_core_idx() == 0 instead of SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = _SnitchReshapeTemplate(""" +// Reshape (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { ${data_out} = ${data_in}; } +""") diff --git a/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py b/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py new file mode 100644 index 0000000000..5e33f85aa0 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = NodeTemplate(""" +// Transpose ${data_in_shape} -> ${data_out_shape} (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { 
+${data_out_type.typeName} dummy_${data_out} = ${data_out}; +<% + dimStr = '' + accessStr = '' + shapeStr = '' + for dim in data_in_shape: + dimStr += '['+str(dim)+']' +%> +% for idx, i in enumerate(perm[:-1]): +<% + shapeStr += '['+str(data_in_shape[idx+1])+']' +%> +% endfor +% for idx, i in enumerate(perm): +<% + shape = data_out_shape[idx] + accessStr += '[i_'+str(idx)+']' +%> +for(uint32_t i_${i} = 0; i_${i}<${shape}; i_${i}++){ +% endfor +*dummy_${data_out}++ = ((${data_in_type.referencedType.typeName} (*)${shapeStr})${data_in})${accessStr}; +% for idx, i in enumerate(perm): +} +% endfor +} +""") diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz new file mode 100644 index 0000000000000000000000000000000000000000..eec4cee60044fd83c5f2d6d4605d95c29ba9fa80 GIT binary patch literal 1288 zcmbW1X;70_6o$XBii%Zs8KF?cu(=}?A@|%O#Eb+)QE;KrK(Hx7z$D0`r7++yBGn3N z1;HvcDl+I;rdloe&K0ml9F=86gyI5K7#$EPqM^%_=uH3hPv5i6dCvL$MunT2*>UgX z&WU!u?#wmgIAd9HcHHNxv~=x4Ra%ySoB0kJAB{(&N+UN!L~RmqySUv#MWQA_EnMLy z4BqZ1^mP+%PgQHxa@E#UwIb2T#qwlL;w0B7rscbuU{scd0Z)s-J}d>@wJLnw^oGcuY1y2X zIA%L~5H+U-%<`!lglY#-_Ol<8mj}^-B?J5l=~Oab=Fe*4_G7$H0%?f0WjR|Yq&Jpf z-$r{>Ta}~MF@G%Di%11=E)D~cx;SSE%}nluXcXf11_`FG4`Ozej$l=74mS@7gDV0?_=n!h zS%)kZS7k23Y4Nww-c!e4e-sHts-3j3>3gE)KQ_s9X(I zdz{!a+jF2VFyOYFoA9gDj*6VxvF+P&u-F(4`_%2=(xRpotN!4-#~6Nmr5|5)wPNmo z3!hc95O$om21!E)^qb!cR$8y1&Uuzd9WT&&pOa9l>Oq066+3B>0jiGQczyF_@RO<` zT4>K_{nCX6&pu%ll?ogS?g2$m50Pm<;Zim!wJ(nmjD(mfEnvMFCKH z@E|0gno9rdv!I!=i&=#I2yT9m57~wJ;N6`M!qRd2#G?cPHqWBF0>|L+^)v7`$sO;8 zH^AfjH}G)oC|{D7iKl9XL^k&@EjCGIc;O;sr9`qBr*>j}{4`2*rljb1E9T*P1zjg< zVU6J$t~=XHimI=PBzMoC+buztSNh--Ukx6mXHhvJr;VG|fKK7gIKy*LwiVKLp&7lB9l{ESGB7G!U}9(bU*`4R cql`UQVC?@|SyZ^Wg%JoQBVcm=^v=%y0ppY&w*UYD literal 0 HcmV?d00001 diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx b/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx new file mode 100644 index 0000000000..7a146e5541 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx @@ -0,0 +1,14 @@ + +hardswish_test_fp32: +* +inputoutputHardSwish_node" HardSwishhardswish_graph_fp32Z +input + + + +€b +output + + + +€B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz new file mode 100644 index 0000000000000000000000000000000000000000..074c937f5ba8e460f806d0b50d526ce4c81ba304 GIT binary patch literal 1290 zcmbVMdr*{B6#rn6ltM)*)C3j@R$d8ekivJ*4G|H90TjY8bQe~bcrGly2xbkCK;(@X ztUTl)3K9&VqThE`$v|wCd_V{TCI*u;Wg4TT&RF|x&Gc9Q^t<=Wz4v#|{oQlUA163K zWMt0#4IAd!yhj&RMhqjE31iMArz=y^mG35{>^EQ*>RG{4xD`+y6c!k~-GJG{WJqG< zY0;^Y^r-CX^iyMwbh zQ~ZBm;^pH29-lMl9GeJ@zk5^D2!DJl%paR<{VB<;fMVMwRIOkkVYUG;ebR`26U9Uq zwo0(M&%3Zaz6Ix3^^%$W=g@4~DL!4fjz-IU@XbRdY>=#sKUvy|_h)C=RLgwy%YRJj zW#{pAbqdC1okyibA~my*zc*7)ym@YU1DR~=+@6l2I zN%9`lCZ6H)J2Y4t-+;^7roqu`MDIWE>HxWSxC}pXK|HG10oiR6;NyN7_WAL6c1H%f zE*vCk^BeHvvt~@Iw8v{L71(9I50`#e2hE%8=rMUA-xECyw%RFjlW?dwK1E~`KGcF; zz!P6TD&%W-_XJ34IM&-f9W4_2S%HgUDBA6uBm;e(lxsbM#zTk>v!^lT37c-lpTU{G1amO6nmE4@~$$ z#~jQWXXurz5*Tf&0-J5;Fp_tnFm{oT7KK2fYatDxZRncuH9l#)%kKT46$QQ9LuG7C zcMbOBSVQlcvq)on>8@H0@|g!wTU7^zb}}rrIz%29EvJINclb79WT_^nJ&(Z>aTHdM z*kI7|wGiuYmSnb8k_+2M$%<`=d$#72pQF0SOo;}&ZYEJhjWw(?&A{L$1>CvPj`8X* zAvQro6#<9|pIrm#C1ZI0-~?D+OQrQrp?LL`23JM)U{Nk&$3P_?Djz2AXit)OpGK&h z2>0rA@}S;sJCRzn)A1KFs5R%%WmJu7))opmZ+v6w5NM;UA>rW>e(QBkpX+pS9d20L z2E(}r$i(^x>>b=lu0EY5A(K7G6uOW*ZF||p{)h3>{YfJ8+z8BtN3f=O3LY+<<|CeV zq1MU`zNv5pN3k17%Ae7XzugQkqyL02`ugFOQ!*%DW?^Yy863E!!AOrSxNk`d*4^n~ 
zqiiemwNt&O0-=W4)lIm*{4ppu=d<61Rpa+ZH8`OC6hbF7*xisxLc=KgTPKH7_k4PJ ze7F8h!KrXEH-bk$t5y)nG+1wNmse?>VWh7eel1VOqD8s5(xn+=2RV#;A*IJoiy-e- zARgJpL)^_gT5WYtuOt5HBnAf<7@CXzt9$7p{0AxnVe@Y<3l5lLEHDPT@xE^5^!@BF Ds=E}U literal 0 HcmV?d00001 diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/inputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/inputs.npz new file mode 100644 index 0000000000000000000000000000000000000000..9d14ca82f70c79c9e431ba6db56c7c19d808c6b6 GIT binary patch literal 2312 zcmbVOYgo+t7M_|eB19C9$YmNtu~Dk||JEO=sK!);Zn`p}%|KA#^ByuT_TMiPjQ>j!cPUpyZp7ZT|Iq$kX>s{;HS|47CJzs4C??0Hq3%huR zm5<=@hC_=tfwwJK79M62EQ?a%>HHvv7l((#ND;4hlx$MrMet&T{sEzWAwo+-p=iqz z;e12kmYpGCAwI#IcZT=}4D&WVJ3<4#^Py57S-|&x&a(N2!#;Pu;U2^P4 z(e&dz0pxa9!27D5G~>ojnm4N!PDWTmw^t`Me>V~)u4<<5#-2d0oRok5TAr+{{$Cq~Kn6B?-Sd8Q*M7 zqaj*TFigK4cFposjQ2}K@iAw}v_6F!Uau#Sj|Y*bdI5*hRuh3X1I96?MDMmU z$Sfc(Tokyt_EJ9sLVZsiLI;BkyrLojn~WkD6`jIWhRVTxP8m6BB*3R5%82jFN^qg$ zIbu@-?+b*=B~oWlth&nV)z6^1?d8mQ=@1rndDHMG*T`984%~~>P&Ob&qfj2+*$@ch zoX-=_Y)#x3z=snLs+nm6yFgjC7}wv-r&}s&X!yEvoRF1HvwJ(909%Y>K;i7?t;Y1pGe+6;>eU;Cpd?by?AEvGeu8H1XGb`z}gI2 z(?I`|q+`Jtj-;ody4^7--pj)H*fo${C&kLcCD6FD1g%AS5VTO6txeg3&HXy4rgxXA zY3AYV-PMSJ$;{D#MEogTtiB1WFgApuV0To}J%DF4zL4Msf6CRE!4ijV z;v!7rqHb-(_?8=ZD(fxn5xIlEl{@hYJB=G$7ZGQ>DiUMuiw1!P#Ck_C^W5PWelRU1 zDM!Y_j+`X;_GUjsZn%t&6J_Z3j1SR?e3A@o>mKg}u1XS`-jV3SopAzm_tsGxf|CZ~%kd=CQ2i(`^LA3(u0bDTD~ z0M-szqI13{Nh+{p?Cu+(kC_%5Z#NSd*P0R--Aa*WfsNMlimW|nQBUt{+0X!7+ zH`<%22gWat5Ye{j$eXsFW(R(wr(Wyf{(YN3QG1Q-_g#bw9JYX`YZfWjmw>zJ3{bOJ zhZ@qccvcKxxqK?rTNElbtnOs?C!GU}pY<_ij5ef?T1)HSd(rE;$*^1O0>z}%OyIfwZ|D}ABZCHPos(E zEfxG^+(qTtLFAEJ8CDDbz`l>>`1Yt9wKY9WqMj-s?T9svUJy9vMK~C1R0uWDV2byb6E2T_ZX> zFhq2*462WX1Fh3y-^#2}D%lLG)QXe~wApiQV&wO`;@zJo1+_Mr}zu!tn zaDhBH1!$C94Gk@AjM?YoFvB*9v2W(Hs>QD%ymK#oeLWI9%mv_VS_~HIyKsDtDTGBQ zg7>s0R0~hXnqO3vS3Vt}&n}yT$WaL@n9f8+Jb-!mmr*w2Iets<<1BM4Nk^+XHy}|{ z9*w`oHTM6G1v`zQ@@hHh%ambFG9TBbCSr@HoLd?n1_w9p#zk5x@P%q(>A)IP{M5+R z{}PXV<)cw&T?QKL{tEFj9mZ_NVmxHvPFU?0u5pzw)wDDKjke{`c25%|30ZPxId0Rw zhII)7oF_}d6+KTtP~roZPOB>J2XkN(n$1k#QJIQzp2nlQ&3)BbfC z2POwW<<=fpsaAr@X*v-1n-R*T19V2dKFUozF^k$TMrG^JDJqFXS37bNdll6Q{Qo6h fen^xbNv<+n{wrl9_UanLgvxg_|6a>~$eDiwp4K0r literal 0 HcmV?d00001 diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/network.onnx b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/network.onnx new file mode 100644 index 0000000000000000000000000000000000000000..25a7a9b683baa30af58fa435dbcca1a61f239f2f GIT binary patch literal 220 zcmdqD=;>)I>5+f&&8UVS5R8Q#a5o0nVwN1#Fk%L0_02a z1o;O0ft2Uvr=%*eLwQ=fT+FEj#iD!7+e{ZQGFmYR3Bim_FG?)P0Gq3&$H5`M=ma&< pOM!u*!5$xul7>1$NQg^-gHecwi;05+hLdEtpiaUj>%=6$3jpPkG%ElA literal 0 HcmV?d00001 diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/outputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/outputs.npz new file mode 100644 index 0000000000000000000000000000000000000000..6167f740420ebf0e05c0ef307e4b6e7bd7d8246e GIT binary patch literal 2314 zcmbVOX*|{W8b9_WO&BIMh?~76Yts4u9-l}~QX*+mBeE36mZUMZnrxliuG3;jD3Zvu zU5ayl&k<9ON?BVl(qtKH5-pZ4GcWF|`{sV1?elz}=gsrt`Fy@EP7;zTtiPen>Yont zN|0r-=0l#P!U{bU8FnagRY=$oF_z+A?EL2ZbRN7d?#?dXiLs(sG5WrK2lqwj+i2;t z{VnuYYU%rjMnpz;Hp%YqTP$ zbZAkw4;%7gJE71rg_@hE&`rfXaKOq7!X(G&bNdAl-aky8)kf)w&}cH1K7q@8I&qb| zB39cZppH&1;Sc(uWW!DToPHNxm-!_mv zyh)5&5{3l@OK`{L>(FBKfQ&?cLC*6kveUeao;J3}VS7{f$?pt!yi0=nlWCaV@QD8F zKs%{U&4;?{nz(W75S=ltz+b<U~;f2+_aGO=j-aX|zZgBc3BIWXEACcD4u76I&F8 zU$`sqPVE8=&b)+0Rz}p#vXx9XeFPC>P3-%UiRnLb1Yyf&Q1@jQfA#WxjIyqht+J;j zZPZOCoIY`ZR%I%xl_f#EjVX*I=|OlnpxqT7JWuDL@S6{$_f#*`CAlCC8KGtJZH%|E 
z47M9R#m)9SrZ77lTer-TnwL@dSm`MwIlJQKccoPD!!EESg_yP}lJH}Gg@(t^@QITH zX^6W)^x}@gDvM@xit0ug#tfQT>xryt2{FGgMAWYloFYfbrdh{SEZJ0#(j zWD&ZmS;M}DAS@0d5UdYVPvbOT zpto8O+t0;k%G0FMWA#P`cNvplqFUP7tW1{frN58m5VFw6oW-WrTU4YLUN zq;nKA9J9z(OKoPqq9VGB4MU;60@!zT;i4WL^43Ke6ArUMWR-yncN$V1cN-|Wn+pmN zohb8il6rv&rYMDDSx*VrSLTq^)I(@3FoWj21_=6!1ENtL%4PEDS9=0M`A>0tWnMsB zHZOwg=oKhuvRa_it19e{XYgXH7mc?`L(jI`pb~Ncg){ePT~#0Rpg0#?6>~ADc@&Py zCbQMN>M&k)3;3!OYZGp>M?;%GpCMYHY zWO0%>EY3TH`QKI3rDA@l_BsW%-F}1MHeJXX$-|pUg<#3;CY$unfVE-(XlP_&HP0Cx zm!-nctQX93zwisTnv>@$@4-yk2X$q)fqP;WWqVwv?EXebzMTc@j$}e=*?YM7Zi@Xb zKOUyC`)RSvI0;*BM>SsGWMa(a;6^SR^NhzJYajqWjGsUz_BiZLu!6zYiJ(w>6n@jV z#LuX##JmI3Ov4ZtKaA8vZfhjnI8}o+zo=oNl06)2J&wyPYH(LzAMwiXCldb-Lt)@7 z{l4T!(v?z!6$%vFrPkwUhCkgqTtt3Z^8h${2GCuVVYK!+?Gn`W8Ch&ixwyTO63YM(*rC&;C8OikUdF7G+KO>{p*hh?-g% zh^_F48wO&whMLy6?Sds(TvsCxCjNl;90P1@S%viMC%R`zC&~@Kr2A|=Fp+0ORzImG zhF>IYKhY$(bb2{nqj6XjvjF-de?|r?fLk04jvPKUdsTuxEy-}KvqG?DZ!}z~lM=QV zaWHK|_qt}zPmI)8i?MORHV8Mog)zE7OH!oCOv@tZo^Svo>m1aNYQvuMH=(}iF8N@; z4FfgvL5~QTPt%d`XV6||MxhNR-O6Ee_ZBK_<$!PbEZHNU3_5`X6?|l%pwk7kQm&zN z!U5E<6k*Iv1g(Ck4ZhyRaP`V@CO)SEnlrO8Z0?u*0~`gOwIY(VUrx|nBO{D-s1k^~ zm0(t%30R*Kk|W(wsGYYO)#b`jW>=}eU#|cr1By}7));)&TcNpK3c8o~3hqRoz(0&t zvDZ5v-|m#OZIV`FSZQpO%kw9BBNKweo^WceY5=WUw?cx65*|LQf$iM0v_4Kl7&(!E z)g}8uQH%?jrwnO>V=WmNaw82=61JOo1lZP6)K3(RGTM!JsmG19582~C_3mPahXE=- z;^Iu12qZ+V&^0w)Wj=5n>Ob=sok3^J{kj}g8{ZGM^K~ Y=JS8GjEj?$^gJUrXV!C-|CgTm58whcCIA2c literal 0 HcmV?d00001 diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/activations.npz new file mode 100644 index 0000000000000000000000000000000000000000..d077979636ff7de35c03b12c1df7ae2e35a19937 GIT binary patch literal 355738 zcmeEP3w%_?)xSYN#HtaaQhY^4#TIq5d6!253Q`ptsao`_ZXyXJnittb(JH&Op!lk& zsI;|`QdCs1F-4>nvLOY;C)iqRty-j1vDKDRwWvJ4|GD>0n7MoJ-pt-5?7FkRJ-@kk z&b;n9XU@!=d6k`iK;JCQ_?@VI{Lu~1hch&dV1$;X1p?D+15H(pwG~sUDg%L<`sS)> zRZW4Wx?p`nQ(d63A{Y$#0yXTtp{2R8r8(d`v%c~A3@uYnhLWb_=y%_RmrN|XEJM3i z`@w0IRlzAur+q8?w2@QuP8*(m+SG=o=BA4JKtoey75R^^s0~)ZKUiJSScUgtIm5H@ zFUUK6c=k=%as39)y<|n?w8Dat(bLW!b;y|A9m~Rxuln+hgG<&=tfS|W7uz}_^>+>K zn0&>LB0u}MYc%{u)!#M0!+1V@^yxo6aMb3rAJK7#E&5K$=fQ84{4evv$RT59M($m4 zTja&I(Gi3hJ$l+pqi3%=qlEl6pM6Tnr?>Ynz4VKm$oh#NmpnH670SQk^q<~3ddjDF zmGp0TK#zaT^PhJ7=<@H6Mwm$Zag$3*Up%&CWZy3$U%qii)#IL4xV+?==UXVP2+^L>C=;0d?CJ5hH1nMf9>so39$h+uUYhZCe!Fg6Wg1r^yV{=(;JM; zDHxs|Z|giTo%J+*yI6mZx^m|IjFCqXBh3~!`T6*4%{wJyj$GdPXv?n7!}qM{Ot*Ju zKFaCgyk6JCo~m!UvDx~)BHPyt`CuulzF(*6OCy_LV?(ei;NSO}7-|>;rMNY*_0!Vr z-I1B5o^iZKE`8*y!l= zv*G`cnZ`Do_}hS|^36pW{Y#?b&p_Pu07`$QP5N`;eFl)0-v@sUWp5pgJZ$h9(%*_OMB^Pc_{qSXgY@#uI`~Vscd@A`;mi!`9vlvt`QYD%ehBHIT^4X-|KBtwu}50Q{_H^qh$vibl_gT3BZA zG46r>?LXe!`N1s*oPF1@J4?vD_J#hXEX8iBNQS3dHt?j zP3{9*{-wuTIenS#-c$J}J=}puU#`1X9g(BEKY3@R?)o0QTo1Qo+3-?|?@Y(>)Ge`H z#YYRfqE>&nbVnJznVSf6H|DCoB6QFD+acR@%0Md5Wpr&r{}hI{yF2hvA-)AUqbHD4|nM9?JZAr z(0k;ri%ZFEFZn+`+(Dn^Md*Fh?qgT$?njID@Egtw>F&4-=j-9Z1w(XqZ0#3%_>1qH zt-qge&k#M_>7zf1I)i^kc3ky8DO6X|wHKSWL!S{pT!dG^Ju|tYMz+L}Psv+b zB}K?(RScsya~0kBs=H?JHLvTGLGu)2aeuI3YI9x145lk<*n6@!NnLtv4VOIsDLsTs z0-sUmFkCXf%y@r$%-8hyUmak)Z!R=kW6mgCa>!yM{J>np{hZK$&w9*#2MX72tL^5mjBZ;6%J)ZkN-EXi+lk8OCDnPl zTuNgb!t}wuZ?S>I`aIz1_YI`exhfieJk|ha1Gf>Yb|;UH#y2AUBBXsh!eraTr}Sy% z@iK&2VN-ruRjdJ;kpHxYj(!cZD^@vj8V2bhESJ8a_5L;Oy}9bRHbf3ZWs%kiNw z;hD4j1^KzjyCj-qPsy*6Urjqv(s36R3-?f9}uSwh5VO zgWMFFHuxRtN7{l|Z+UjhOdI5;%Cte+@|k4Znb+=%gHrGOBgJZ%4Cr^F=D&Z?$<$#Y zknB+x_i~TgW9zMHXTln?1lEvFSggo;M3$3Tc;5s!8&;4!;AxNj4&cbLlno0)rup28 zXR>r1i!g*+184($AMe``rVrd@aLMXW3jaj_@_PvG%o6Nv2d*6MSO8gFH2C+&^Pv$V 
zel_r9?J39m%fQV8)Z)1>U_Edb!lknLO0X9m&t!R`bjZ5%HM|c-n32Gd#fjpQRc9;w zUIx%_eTP`ZZ@#Pk72-L;5o}SGjad?C`cW|Yd7f49$i&ZIM0-2|&#ct$=F){|7h^KzKL0t~ z0|0{nA;1)X5AYm-;?D(41^f+A3b+vPA>bLnBY?L6RJLOPe*{zlD4mM{zX99=I1+F) zfZ`qpSPFO)K>ZM;;mkz2%Ii-xrLFkRojKd}_NaSV<&Ci{jh!)To>G`HXU_h!FdVsJ zRVZ@qC!xrTnc>LUdqR=HFNGo-`h_F+-X4m4Q5=qx_X$V3P7OzX|7IvM=Rcvyj|Ujf zPhy+Eqo0N%t8$F@>}Nue-M3)Cd z3PrwlUMOuS2tNa-&_7UHFY zv#ygIzqzf?ReDe9y&m*2(nW81{myvnyIa2LlCG%te)~l3IW+ntaPH$_t9@-t_nx}y zil)lobv422==uI6({V;`h4(Ha=`!?mla+Ys;H>VP zhqTc*gXXDe3uUHKrpkRR1M$}JeWg26EnR0_Cpn~bIgN}BWtqFbhqP{dQh4wE5Jkg% zM?urpH|8gUS9RTb94>xb0_aLaphO=E2huD+{jsK+t2 zw%5KV*_6f5MTZnBP&7zY88`{@%fRg$SU7T3Sc7iPdQP%`=W-Ay>(?=NfA{a)_^CcC@!+M|(SVjkjY*dpmZtw_`_pJ9f0| z&)Cu4jvejo*wL=PV@G>CcC@!+M|(SVq_<=DdOLQsw_`_pJ9f0UV@JF0haK(h*wNmO z9qsMd(cX?7?d{l!-i{sZ?by-YjveiKeC%j%$By=P>}YStj`nu!N^i%G_IB)OZ^v%* zcI;@^%Z458?by-YjveiKI@rFcVn}DN7>4?>BtE}~xu$nf+#J7!##+Tzr(`p=F znhCf(5{;)LPzL}v7%E;%HR*ZL-Nf8}ZLX{u;{H*%6(-9uaADuGF+voA`v^ zhP>%0*sV74Cj&PR@h1ajl-Q(?bm@pz`VFwb&qcg#C?l26EC-W_ zr$&rpYggLf!-!91n+3SkCV!$oeV+!mb7ge;L_hj=qR-%{?{}7Cy7znU-F1cjyyU$% zh*+aUScx#5j(yMN%}q=dTY6JP(Y>jCppEl}XZzzkSyR7*n`$tLGYyPTYe&-Hh%^ih zkVw!C{16MiZIpp8*EJw%WFvm&Xe0jR5d%Nh#1Am>gH8NY6Q8MT2-3(#d`&arZ!+`G zHshaZ=C7IM>+BHAr{P^M|4Ji%tEK$g%>2V<{H;nw84<>Pa+2Ru=w!()}872*9i2Xg=zpXbf%g)AL?7 zgO70!_domf8C_fcbKUsQ0(rXo=?Q-vL+?BG95mK&pDfYw&$g~6cl(}N9lveQwus@s zSoh!Yv!CnnxBsP4cR%^%B;&o|e|q>&Lg(w@J~{9;BmBRQ(fyt2D5nbZT>7Mp-po~W z=erWG%qjm=rwpX>T((EOxv8b9vgTU0N4>79zS%fex2~bGrM998#}ZWraM$34)~aj>eY=#xivrIR-hGFOOLNai%v?okUdCs)EnhMRikCoAx46)qQluYqJ<;7i zJWk6kwgnhJ-Goc!aTc}ioet60Co|z;uBVU5F)qP|6L$lZy!BO5h+G!PFnTjr(Vefn z+n4&5>6AeO6=S7;uwiO*UBwKenaT;&u=iwdl&H31iqxFq?R{6&6g;sdh1}&N2PWLi z-QPlVkFPV(9r+=kvKzOS<}-Cn;bkEYZWVF zeEFzoJbewVaDE+W%)mM_Yy3{cs|GF`VM=ZAp8#J5yau3kV%GR!tn3{CTr0xSx<4zv z5xB*GVOV9n(k4EwPM(FhrLl7KQu*^+B#r1UJ9WG;7o6aEWV4mO3 z^P#rD{iX6Bn6q%M0j-VGFdGO0n({nMNha&uG2t)5(pNAqBBdwkN!jTgYz6Nl1pKzoZ;T8bD1UT~h z0PgR%g(61-cl-8GWb7xQNC0Wxwks5A|6eGw8eu5@Whl>k@Eg)U9Jv)~pNMib;rTk? 
z?n2xLkPnsZyiY@s3-J6H!cscdBV0Seti$t9k@o8ViaQJ8`oRBZh#Nq;DGjlb|Ka-s zFA1C1xk-(*{Ej27B)isa4`w$MC>o@y3}k5GE(UJjz`~KM!WwjQ)_2a0n~kNxZ@)}Y zDLYp|a}z{`SIU{mfKp(FEW}F(XJscjf^%D@tMs1Id#YodFd2xqzB9e)DNr;>RT;>I{4#L+1{RK7 z71p4ev#yiuO%uT@5sz1LV?&AOky76D(-Rp>=(f*^6(q#tiYb9EHJqHqvPXWMc36HtZQMMH~%o8^W~#-wGW0b&`vH z?HiGw5Bu7)vEO_f((xgjhJEd=2t)BT;LG9XLwXwGYy!;1a~beuz_(&AJC&^s@yqaB zim;SUCY23g%2CcT;2HrGcO%@5h*u8qAsnS4>S>;dl7r6_JCs84ov*y{>}$_c``S}; zU;DfB4|$JiYN?S{eyWVL_!)O(_7q?yPUc9j zcV|9IDq^>FT8X!R<>1)txbLh;H$GdxSLFMdkKc9^tG`2Nol0|4MSZZbAy^g2@kg&k zNi;7#o_}urwsd=U=A~pEE=aBWgVK%9*6$Vh{?E_8w~B67A>i*1vN@(R>F!^ldwVi* z|BQxvdKx>d-wA)*JF^2}=nk4T8$8{8Q;qm^$IZ4;(eXP+(!D^qn+WcN(b4!Y;?ey) zM!0DFCg8JhHxJz-G}$J9@~3a0`*AvL()SS#X_QkwHu1}W--x)|05c=e`9FlR)#DDH z6$p2!4W9BZN8D^co89*VpN%vqjJ7H|efsXsQ3m}M+mwGBa4Qj?ezR=wVc-X%eDu5A zrhJqy@kRrbM&<7;$A_aW+S?Qr78fS(6+%jcl?dzdubrrsZFpo)*i^;V?o{tw=a%Q; zN_%{TuA)dK)6fX-D`|K{s+opJc-N~isbm9RI?BMenfOvOezh6D&BSkmOYu8Lk_sY~ zY{c&zX~ds^@PrTRDvVUJh^O#`-(=#mA_hKemT$6&53e-R*YHl|FI{D%--+}nzRxTl zm7DP8X8C5A@ykv8M#LlhHZ%TAmWgNd8)%k~;V(DKS8no;(xvn@ zxKw_k)66bW9rK1*DZ_h#GQ96S=#w{{B-^tyzoiWC1123M1wN*iq1`&@cIKm`B6d6VE78VU z&-GXFJqulK`kko7GQ8*PdtIyy@17?A#5#P{uRlLtquK`0XS2<#3vr2|bvLgbj3mIU zoSt`i7>xACc^KcoKAm@#_S=5l_>2{!#_I0eK3(MY(H`t1_W&k}-tO*tAB+I}=h)85X+q zN*TSGtLV-zsGPO?^?&P>fmEK$cB?lxwNzEsT+4Q=*HzUw8)xj+HB`3LRy5&ih01^* z6NW8S<~5!aI@tyOldqjy`r)Lr#;-ZG|5$RLTC`#{xo_QZNhi63X3o>yM_)Zs$DcQ9 zppJj;+Y|J7kB+=V_b>E6Gls%XxTLg%-1EOUR*(1X7drItp+CN+yWbhutcN?R;0_%x z3OrAE_njs#vvf?hnY+JbJh@0cyKVWBIZ(U=lDfsEqeUswkGY=c?jIhfD$i!WfktUG z%~{mCcRUoR`WknAl@ubEWiqVNGS`z38xRue9H zYQiO*CS2TZa57B^;Y67f)Pzg-HI2;nF5`-gDL>&dn1imH^Xo;!m&~cLLU_iS=32hGUuX9N@!% zm$6RGioX;7rC5gB2Dc6Cz^wS=G5jnAZZ3e<=~?kbqp<`SxN5*WoBZbkPs@SV15fMB ztntHGYWyZ}tpE+{_^fzZDm)P1&q2FNO zmm&THxTR6)i)|4v$A_XtM)c-&x{K6dv~gt2(@QFvQ!PMH59YXk%q;N?@&g zzINqf%tRrzWuq{KW)Rk*g-u%!TVI~v&eNh;sDl~NDb0b}ntJn=?7~!;K}cIhlaMv6?uV|0Y!QTzv67OI0SQ}{sB9lL(Lg|*+^YqScZ)@V@>`Z3qDZPH`q zEqkNiy3*hGTNnOAzjbq7@3*cK?p(YN!1LhO`mL)5eiPyT2p9g1cpe5EJx?HizyJz^ z_e?+$V8EaHt($@HtpE)$8DZXpe;I)M%E{f(Z{1ePV_C{ofH3mhh%gjC47`v0 zk)DP)jerSwUJTqYz(%udZzBF;JZB>;r85WN)&uWD{Kden08remaI485FdyM44N*_? zOq3fE7p2&lc8c%(BI;Wf9_(k8*SRwmXsQd=H#F559kKk>IhD4tPdGAUS12-bPbl(R zEgaba_t$tIgXb+@gd(>Be+-`6GQyDyc84MX;LGt&&qv_-F2FYthTc1Nh9UzIH~WBa zV(JUq25;{7Cazgn9DIP~=niQ~YJ{`#b#3 zLV633_O*z21)etmcQN1}X4z^`&NX;mhp?2+B82-F@YC>o2ynvz6n75Xd*T0mz!H>+ z(hw{8aW7p{62{GJ1X3fdV#kqIlHK!$Z^OX=3KR`eQwDfDyc-WyScAtr9h~)@bK_=X zX(+b0G$>`Kl&fl75+(zD#Z}7AnB0+}gR`=e9KpG*)m3^gVKVQ=UxiQ2j++eMl-`T( zxVh1frK!n5c?W_uwJqIS?5Zo8DudV61goR1 zlu4%RaGB*Y1&Ri#Dg(KYUj}a9z`~KM!WwjQ)^(D7JeT7DNPFtDGp3t>*%^O+LENVT z40Pka!n@ZIMWg*kBc;m>+Q&-=QSZe)aZ#iuF24HXDfIo-_Mcn91ce<6VUt|S-x+TW)SDXd_2NZI=3L)I^fSl{6~RX4xqU2!<|b0fCmtc z(hw^z&qT?=XNnz4q4>^M-gx%47pQ&hskyJ6KMPK3qy=+&MYF9%J;^%X{A@}(K8s47 z=eN{I%byY>t#o8C|INC&ywVe9D}I2Zj4!wHbiFL@4?)WeTCMsxaL| z@X^~PLK-f&X~P9G+Dc8!i&*JkT%gkj7u?WAKy=BFNGHU_!GuWp5l>Zuh z<6O9dY~mB$=|_0Fq{&)7`mVW1oAN1LWyL$o@!?oC+1yl7A8c$0Rt1U*eaU-|kcQ7w zBCJGM!$H~ajXGg^*i^;VYe;P7m-c}=&hv1`I;yE&MM<8idBXcjYNALrQ!{1UQ#OB$ zruD%);Y&LV`~3L}+l=5ONXoA^#Mei;4~UmIzppNa64KHYOh-{(Vl2;XVqU+*yR zttLL(%)ix4pXfyKv(5C0E)>7q#AloF>7F}EKg&!Xl+@`v4DUL9bQMY}*@&NQ#vg2! 
zpXI*>@09-BQAYehX8caQiNJRuT^>*0LFpSRnpCoZXWw6n_nBRyI_6EWQl|G@WqOa_ zz7nrpCZ2fC{FXAk=PJ{?2ZLHXjEg6ZGqWYfioRaHaMt6Br+8@fRf;3>-P!c+SElzQ zuf-xKd5f)JdQSyLio-}VOQyKnw-weHS+ zRrk+$^HJly^nKmUxN}Yy;WGvwWVqj&W5nxxu~d&&`ic>6@_gN&j0NPcX`kx&jE$O+ zUilXN-IJu4=0^9{+N%?7)$O8&rDlQ?#9FJ)&2ke-)r>uXErR<-7&|E=^}i|d#m;L z^BybJ<1M}7?o#sq)=Bs1@h)xpp6-9(pP%g@|5c-w>iFZnxxAC!FSzj+`nxFcJmKB9 zo4Bl8W+Y~=A~i4Lv)h(0nFGyB&@JOVR4y)EElQGp%=JWg|M2)X31sWPfktUG%~{sE z_dOJ=3LAHYm6alwwKA;QGS`z7p;JH$WxVtal1o!=U+PVex#6Sc5}PY`C2`LttIulOHvbVmxH7XI^@?+NMSm84sU&633aBlG?bpkNr@5 zPoOq_;rMY-`X!?NdRs8P5sQ6iVnHu0TBg-pVtv-I2w2k^ zfh&hQ5DSj2_>EYtI+zv&W1KnE2CrcuZyVxfB8^#CO&1j(cs&g8Z)3r+6<><9X*FyC zT)X`51a2GB*x7+qXg2XR_?IEf%W#Kd!Lc>{HNceu*F@i76TcPT7e@NS5YDcAOW=Po zz8$xuYn!l|&ze3hq#l5@iB5L(rLw)!-ta7*%FqkySXrAA$jRqx zS3X8{j6iDVMqUa{A*@A{n6@A`zdXO4Cq`+}R3j~ys;#LvuglI$l_`X@Wi$!7GqY_j zMQ@rO4AfOMO(S0GtKw2_w}#@iF+Rcoi9g~}wqNbCQjGP%n45sUm^#}uVP~6OuyDf~ z)`DlzGFUoBJjb{1=NCNo!!CSd98RbmhZAbY;e^_8IH7hNPN*HXu0NiSdTks|s2ztB zYJvMBT=>`H`BdQOc|7?8`coLZ9||Y{^#9X1oG?2Mr_qkX3AN*JLhU%5P&*DM)Ea(Q zlDlCXPN+p58<8L2GlV$`FcW#cig*-%Ch%vHKhoO`|9Zq5kLO2$I~DM%S++M2|4}?2 zkFb=^EeN*`_%jjzQQ(#XDDL}kr;it+qH(|ZRUeFCe# z&KmQ@J(uIT9dMnAdwEwVqQU>TKH3LHmUNp|@WMv`3$ z6b({U2KMX;bun=J1{RK771p4ev%Yg~+-$531@_hkrR-b<%}o#$UMXiL14@AzvJfvF zoRyvI5YBDIu2OtT@u{A5!ek)c3eOa$lbbH-ii)rN{_Echo1UYq^b<$Kz zEz@Q2>Bj$f>ENvIq^2(N)zpQm>`K|~V;P9IlJ6_!k!tBWD?8ayt;;!O;UpW?x-m-O z?RyuBF3CX`rT7d!#7hTJ@lBX8?o!fF1O@C2;f6*WQ(INVj|?@L-B6}emMCfy`Au{MZ9zHya>3VfS1j(y^i>c@O&)7QaZB`ZVm9m5q}YI%K#L2 z3*1Wb2h2k_N<*x^JQrmLpec4dg<`zd{q03+e|u`~Z$D;N|GSv3mKtppr^;vxQw3=~ zw=ZgSp5IcVt>TmzZ7F8+nw)30WE1XBv7wh@JkR**JFa+_X?5|mPn%Ql6WLUq^vG>Tdh z@qISF^%SeYmQ%F-{(Wx~@pvM-_1n_z-Ix8kxI z3R8+pbFyskgAq3j9R12{(x(fBh9TZ2xY+<}`s6nld21+VBitzbDovY;@@pu=>vriQ zUMIrPkMgj_55sQ<((Obzx?IVMFGpOuw20g-Hu-D7(WO4wz^%8zmjgc>=@N}LA)GaR z^3O&X{fB2Op5Z9m44d?++yjuVhVoHf*7zFyb^uTDw;_y(cb4PBv3jz(siHpE*buA= z6zAn8@4Z4wgp~*z_2)$G=@(bM!c?)fKb6}n>NpP?InGV`15r&;ovDhFN~Xb4C#o$C zkVrMtAc?9m;YlSEp42koNi`F`)l>l#j_{?Z@`NYVOn6eq6h91q!qboNq?!p&>Voj3 zk_o>F?}X2WOL%e#PbyiDkMa_pRI`p>W#CCA>v*&Lq?+~g5uW0AA{?JT>3wYMJn)n)Uc5o>Vg7iB5zk)vV`_cPbx! 
zC*kQwcn$u9XWvi$e0);Pgs1XSd{W7Je6xJzNRRNEnLnvyK0c}DnU))ac>|q{^*z_8 ztnaB|)fbyzp5HRockr9eh;L|VZft1|__`bF3=&d^xaoTCYkFJ;fs&vCq%Sx#TXwMM z>*c|sV!S886&dereJ|*ZDy*#U9#oLXN#1BHSl?5Dk<>|{4hVH_sAG~|@6L>r)x~ay zfF-^W_E>|{i_uoW78xJQ`rZpG5|t6wlM)hnIb_84BQ&aQ06`l^TVeHJ)HOCMr{|ur z!N~ZB{qq-$)wW+ZUdx`|rMu<-S*^RBfib$fwe3Yc+y#Y4>Hh71T2@N&{;z729#52b zp71@`cj9umJEJjk6|H$8pWU{6$sDL&f^H#is%osQm;yl#m@-UPhy-h<)x(RfEj8}5 z%ysE&QIhmyt|!9#hsSBe&1OGXbB&*~uRYB$291=nkunf$nA%)dF~g|2d4U@Cp6m^iQDeiH{h(j(yP~e( zi7hMTF2_7D>1OW!mh$~YaF(w~c-|B*Gxg#xQd2LfVc~b?x3gBWx!9~oO}%)qr}ox# zkTbI-wJO$R#vI^=O^X6^J!@4p_0lU-FVQw4#zE#LfRi2#x2d65_f?e^tMGFh*1S)g zA+Rp<>4mG(JsWNnFSCq}mv*AY^HNqF7xl6_R*X-eI{wu&vnR5(di=8K;wbYag`yvG zJu74I(c7|VDor)ke9?NZxmdept3%FQZYnDV^uuXcTo}XjFjw2<@WZOT8i-rWIVs7&^jE- zi|$P&x)K|cOKMNQrI%o>tWNJ@CFV`f;^_>%ppKQfDS@@J&q2?=#EcX&lL)yvDKv?& z7E+zIAl6)--!hX3;CG7b-$+|VlaMel!l#^;U;W=|)`IU{t7Wr~ILWH-ssFrr-SHpZysiT<_8&K|dl&9_Kp#BczV+sH ze&GLw_b$BG!(9#-WIit>e?T9EAzXjJ5WvlNe-2?@1y4@57(sj|MzI{zz{({0s2B5YNAd|Gj{zc&4&l4csI={}|7d&RD=^ z;Ew`K0`77%ZWG*g_&or45n(6|QC0I?lpGf4rPx7tit+qfwc9VhZxXA%&K#H9X8iBNuK|9)!25W_Sq4~v=a1q46yPR2Q`t^IoU8F%PWj;dC4}1vHy`jB((VU5 z#m$5J3jEpue?(YHL#*Y;y>v}U*u45pYP5y>-wcDjb!{T0MnC2zpi}10FNk|tF`W+K zX>_XMEBftE8OVVAGH@#g3rDUBYtYSE-#IsK(MuMwi^A60pp>1fpt%X6!n;@2bfJ+_ zV1_KjO9y9VCp(05Te7PZpHh51=w+mf;_~{P@m6@ZeA6XeQSqHU@reatb5v_H&0ubq z#a`LkmhQcF)fG*Z!Ru;*)zLHmNv7*?E!;8%iUz4F1G$i225#TL!jY@O8gz5kb+Vm2 zxA|42_*{j~O(+%Kz2c?|jg&GoWFcNUh>9=nsSCfFx=ZOxR5=9-3j9Ary7 z=aYRQ9;`ft3KR`eRR;1PzYN^IfrTSig*E8rtmkAqcrJ%@lIl#R707M^pz!XMMA2yf z(MTyXgZA;#K~#MCzN~(1dyUkN3-t0Fc3dRcK04IZ5$O!h{FXA?=GmKV6|<8{xSW}7 zv+%5GdN5E|)ikZjyqm$zQl)sE#Ju*hLAJNgjq-S~f!ZL;Z;(A}&A;AeE%@I3?Y(kh zR6)GQMd5mtfu~?!_;J|ly$Ub}`@P?RI}VV6=i9I!JO}uX@xB`Gb#R{n9Bw}Ug8Ts) z2t&AjfUg67i1*(h%qxJK0FT4325<_1{LUa3``f4E`5@#m4)8hrZUd~s^WWi5@kasf zCx4{33;y|d{so?2ga18%Dm+u!YJj^4&v)UO(isDI5BMVi7Xf#f8FxC|r{Q-$;01)C zG(=U+b5V8xnqtROD8}>Em*3w$l37@z?otx>x99lO2KUt5;LaZmCpFwENR{Ch<_gk! 
za$i*IJin!eTLmdG+)~WuRXNXW$u>Mc#m=ZvjOQ88AKfY(_v!D23y9Uht)AK&QC~Y< z2cHs*RJVffy^w@wq^vIPeLI-PTZI(k_cA`#b?}AJt5One)AW2Ky5glhFJ&unL8{#< zq!_=K@g?7wJ&*1QBG7NMn9@*>)nx$~KBCr@A&9=b!gM_!@aFkip-s3drwG^43$VRwUxYqQ; z@Xta#AM)5}lfH&@=OPTnKPeI&za0LZ2-7(dR}!lTIdlE5Z?;bTi>e*ASj` zvW~|);Yl|WzSG2$P9}U9`i1bMn+eZ!Sa_9zC;dhER=g8FY{n;@Oz}xC6P|Q4;Ym+Y z{-kdR&*GDACVaUWpY$5xJE7Mo{wy;->0^pdI+^fPw+T-ExAliDDXkIG&yC)?i z@^V@E^Ji(qA%LKb{ZWk6gHgL`GSqXkYB16t=S>>}wU>|U(zahWUMn3lMt9qaSL^QE zlT7~yOZE3*g~t01ZLIzto@skgeV5X)}DH#fCZRn}a~_OI7f)i)bEAnF<_TWTws zaNj~@ARl|&TdK^nc`3AY;(-yIx%->&_YV8(>h~v|HGbD41ICbh;KT#;_tvi;*+uV{ zzkf+7xqrAhLw{d7Ww{=%_S0NF+_hhxGM4aX+_oV??(iok>hFU`yrsuGV#H~BxSFvS z>G2-@(Qe&snKD`rCrUg|_#SLMaoI!5Xv|ziYhK7_w=G{X2dbB#Tgao95YsIp!J29H zknM^oroiVExb(FsN%}F@6XE^Cq(4qj1f1SxO)f5r7O2D^)1sWgGS2PNErw=Ol_{Km|-+k`GFetp6m^iQDeg;`^yz| z1y5{QDNnIhYJU-&`{2SqB+Tba^4{;~MrNwmC_+jK(4xojCL_;gS z2!2#Hq7RkRiVwp-j4+!3Tchxz2IB?!Ff5!l4;#a(>7wLaHBCfW0Z;<01ekAvg}Kw* zD6=%{LoqiIni$qm51Kft=6Y(PY&1bqrzdJ6o{^G$se;}Vo(+EYW zGL4Y7j3yy>W_Ax&4fXaOfA5N1@fs!q`kknGIPtPTop_nTN1a~xyAgv~3%+-?Rxhj# z_gH;T`R76$Si2Ah)-J?>wF_}z?Lr(_yATJ~F2sSg3vpoWLL3Ua5C_&S#G$kcabWF2 z99U~S|APDh83;qTet@q7eu($qAj~U(n*fi)uLf`mfc(xN_k)Evuy!F1tVMo+&*66) zU=^PK4u6V23UEL9BfVYl&&TsG@cbJ5?*UZdnaWlJ+(mf43(u6!7{GhL9|^b!xXaAA z)8Rf1zxx3%APl7;s%oB#lEdP{6g%urF`i#-`uZ=L$Fb_$;{$7RQs=;0Eh8M+usala zX-_CJQ42?Q!95gk44yyO6^aZ5{$M<}117@118|!8ya3Oy0k)X$k0YHI022`BN%;L1 za3kPJggF=RHgM#(6>el_C=$f;ameFFKo*{_N0=huUZpsI^8k0k?>k8E=kWg&a21|A z$RFT0%eD;hPsVcyVJV#fa9@J^4Zz*N{R}{HSHpdT`~dGD45cAf^X(76y(4U104Fot z%JVr6x038WB=kZ(4Np~kMZf(i13UMGx)`{XgM}kkg*E8rtnXyZ4bXQLm6g#}VWsR` z1=@2%)vi)}O7W?lb;4vI-U`nYr<0p5>57W)qAwQb zhRso}%{+s->l@o;Yg@Xv+f`RIRR*uC305b4Cl9w>E>oarkg77^gZwgZ`vw+{Tou-! zo3pNy?c}+f4nQiDe$4f}a1mR&^9$lWBw(PM04ThBB~dime>76c%%FX|bPyF^yi*ss z#cJw8S)9hxrQSseyNAQ`jIdxHhjq!oN^x8l} zBTlQWs$wVF2I^XBn`;_tYj9^?O+!7-vV}(~Oj{Jf`lLY7ASGqsOz@3?n;f%nd{Dy)fDUlVqxTC^m*4 zi_P9E0j1dR{Wjcl02-d>U_-bM_>b`3iT7H#%K(|?b1V4+G=w2sU%+5M81E|(=4HT* zfW`2u1`Gv|-*9rV!MzdB1CYl#fF1Cg16YaYE%2xKBLVZsAL;Fce;%G&@w^fKa{-lj zrm|H7SBB?1@l5HI0yY7k1tdo{O>r&=fnLLNT7NzHx1E z_p1%=skyvyrG{I%DKXsosx#B=`<$6A*@hRT*daA; zg!!uDMaJ`Iw~FU|_QF*b>f}~WD`!PcI^GDM3XD`2gJ&70XPo!PGg4L;yRFzt{6IPf zM`_1vF0U!?(6o5~E1qxz5N{LG>BJ>ORy@&UI}vv=UmRgs{%Q`qT=MeTS&>Ul3_!yo~ZRdc0^q&-i%LrSx?`@ zv+r+3IVnE96P|SQ%q~$M^Y&ON`+J_Uzo*8&7iWG;+28Y&{XGS?q&qWPcDU&4<$Gv7 zu6=?QhlWdE5E<`mf6wcUWiTGCcj?PTl3!aScWk&p+#@>^K9|)JcE{AjJ684$oN?HcbjX$6U|FyhV`U`L|$Gr>%KQN;t)X4 z#?jX&>A|Qrnhf<^DGf&Y<2;^kpf>oa)!O#!#%phOb?NS=0b_M{{Q|@Pq!ot0uVswx z-}t5B78M%)&8D06i1FTezJcHLxRK8As}1*)a}9UXTqE7}BlUM@LP~1GVwavNBRF#v z;rZ2-SKK(`Ae}N0%WlFqH?>q%)?CZ>uh&)8Hyb-3>KZCrYAc#*Zm6mZ6kv~gOO<&x zFNJPsX>M$34)~bSmJyt}`B#uF>D09+au$CyiXb{!?`Ksws!)?&X^=(9=Ea zrhfXnDDgbud$9GSDE9D+(U`f4*1V9ojaVz(>n|Gye*s_fQ4PdFihNu1z&ID*-F?(_`M7qEv)O0a8~@OSnySj z#b4v$w%WwMlHy?T-x}cNARTLb@}os~OW>B-;Avr5De$yF&@TO9@SlZvmm~irc($fL z7Yo#Uz|#V}P8)m}=~LM{DGbVKjZX{Q)&qYM(x-4%{9^cRL>lyKqckk-`~cvI2DDHy z+a^BIE*p7#4fs+#TjNuiW+5Jx{Ue+BcfxNZ%9aVY9IiF~1o%$|+=%p7z!mY%f_xYj zPS;g5)2XsKzTzBafG0VpA9E9-iIe~<0Zxnn^G&cYciIal^QgX*Me~W8#;SR&@1iE+ z87bM9D(>xrwG`tM$asEeYaI}!!g4e@p|@6oRDTyMF>ibpPiW``b*v0d39OZU4tn+_ z8ha8*?cl(*1V$gyI$uU9>HSF4_1us2&hNGOzO)6g=JNcOnnuV^m1%^uWi$!7GqZcJ zYAD4HK1}CI9nEVGCten(7~|E6m;7?IV; nYG}1S8EmT`zZ#G?efrncHzLtlC&0ZOelq}nLl{a!tmd0@|8i@%FIsPa)NrfN zakzz)^x>B2CSbVb&o793W%0m0dqQ0b6b({U22eM<%`0x-pu!q*&eOqJ-$|Jp3hm7e zO4%vpsv4Mt$pBw*m9jGwzYHCmm7VMu&TZAMQhW&$dN&3u{8zvCsuW*z&&^Hld76oe z?}H~UTo8taBCwffFn7#iyKHSs_jbGLil)lobv422XhUU^>AIXnhb&W|XppKhfXfY+ 
zF>w0^7LHsM)}WiSu9NNLxttC_>NA;4d2Rxr@a~mF(P;nCNGUUe_VLm|RD5wyT@)vgQFj68tQSDE$y9;T6mdcdd|1#kU|BD2B|6ouv!%|aQg-pj$9SipqsOv zlkMQSoYP6FGnu9>+yp@3-7ATr(f*^6Qf3D2Vhtu-W^!fD&x@{tMi*0eb+q zVng^@!2c7^5#X+Y`)j~q=JNvb2kbH555V)$fDqz53%{2DKL9)iziEJz0p#~ha)gWs)y-{Sdi@Td4A0QZqU()$wrxp-cH=MC_`8!*Kz+ce<5gXcSdr*ujH ze+B*sz;}TAo*B0R?yul?AK-a}p)|zm%X3k70GeXwQz*uJ-r%06Hn^we2KOzUe`#RK zT4uPF?@N{8mfO6(^*l-u`-oV@q-FRB?Qg>$r%pOJJ?_MSc0V`be^z)e`@ znO^VCjFi>IZY#DDKaloRgVT-C*6$S=UpjY68LPq8bF_Qm^kvUyqNi7-?Kv-HD{(>W zJDy2I^jUe+^?WXvV!Wr;N36uJWL_VTr6aB52G$kGCnm27fBs zR=`}F^t0jT$M==PEkhV<`r*+iAN+lA+ic=f{-wZoj78R%43qC^pW9>TSi~Rl(Dr( z6@BYHg`^*I6QPguJY2qR)b;seNS z>0|?+34e;e$i$OAr1&dFi}*Z*r}%~rC7o>GDIbbI0pSTx`k3NxT4m;MroY5YpY$=| z>76fME5cKJmOm^1T%=3sw;~_HlRl>Sb1m`PSbQWw_?;$x8{$#?p7=SZ{stBm-hk172DCZ3hA zli^MJXh@IeFO~PEE>R!z_E;(VdycZd$G<1T%?6W5yfeS0?C&|s{=ScQC?7Qg!dxKWdbUpV^JYEZ)%3Mf9*+ot|+TW9(k<>|{4hVH# zjE+lsy*o2fRu{XS1D0epSTUYo&lJo4Ua;>MG$=EyCnY3S;&1-_v<8i88-Sk;&5d4N zst2QXHBrr~!AO6cH*E~uy`jEq_kXS%zvr+Yj?vu@?;cC8Huxaj-Tlid1OH#2{=Vl= ztMqW%wtj~H2hZs5yANzI@WC?-_xyo6UfX@xDm~qjtBr6a(@OR9wF^tgbta^wHgsE+ z&j`+3&;6ON6yYadxU^WO48*dV@XbvvRh2c@vi<9IRrSrr4v4yj%9h%SrkWe7Dg%Ys zlB6GVJrUkNJWg$%&3*%o(qP`_e4YTjucz%J#5l-Y&+Vg{dcjOX{Hj9a8iJHw=C|8QbUZq%Y)(4BztoZOqP1^|k9K>IbMP^ofCM`q* zOy~d}i^HsVTIe7BBWAUC9KLfZkkv8R_ zVG*7c{}KErAWSysK#K>hcq-RSKsm}k!G?ZY5uX-@cLExt@XmsK7#2>~RW#G7vRE~p zm%OW{l>jRNM%BUZM2+uG33I2>W@*-kVs0WdF|4C`aa!94T5EQ?y^ETNXQX6b%HP`u zYpG6j0tZC=(AGL2O8)KO=~ij2-z!#P-uNt>(2&~;>R1_^5?CvDxzp(ww#+)?A+7&Xc3GxS^4jHPqJB8xy4Y{#2PpNLxmekUKNG2djosZ{|Sp+87^U zfaKxCOTRktGKG&ioquh~Z&(YyceR#(-%l}k?1!EF&&@cnb~6sF-HZckH{-zC%{Z`j zGY+iXj00;omO|=KBG7J{k~0oM++p65t1b z$KW>&a58}WzDe!}n{i<6W*k_H`~d%h->rb(;`wjzr}!fP_mMx+`x5@ScwT_#4e-Ak zFvTp}G~m92=R1I>bV>k!1^x)YcYynz8MguMui$qd;CY0hG(=U+b5U|wj6Qc$)%2E% z`sQGD&D3Vacn~*r82{3e)81v(cO<(KENZoBps6lc-_TTNbjEV>Q|G|i6(-?gXej8rug}QrSRJZHv?`dU^kv`15^Uu22j~9 zN1WH-{t|df=R&ym!aW{P0^9@u#hn88TKG)^e1LaKL#*bv}U*t`HvYPgl>INVCI z`;ah{#9;;s6b({S1~MSK4BU9g!jY@O8gz5kcT(ntJbQD4Qg*I_<|c>=?_OEcg+@w& z8L|*B9h{Y&>=@3?_NEk{QhYt=Wu%MZ^7@_eR(Q93(uQ44(e|n&({;EUZkYl_gH)A)T*xm2w{KwK$W>ttx;g7Q*-oCz z=>Vic>Bn5p3m37aJHH^#WsBL_1Kk8b;oU2VqS5}Nky2&`?c=3`sQBWZy2w>i7s}#< zR!t4y_c0C9W$@`HOYzddS>H)bUF52%i+wEUZUUh2Uwvg8ZzXq=jdV%ZS=q^sYF!R1 z3n$sA)=dBu{;RJ%Qklo;qM+P(6fYe_#dqkA%}<8Ssfz-({Ia1Dr`1+fu@h|r*ec&# z(^y+GrJ}i}p&n=1(%yONIf0Z)l3DKxVSQ4dXpoXJkO#goaFb&ej$9SipqsOvlkMQS zoYSckpS#ex38lg-8ZsGB%FK|3cqt_*FYxEUIS#d!WUA%27VKv-A}yJ`2S zqe)h2)pqraK3NB3866=UEA#P3H$2aj?+DX(ZyzZTm|n}QW%aG@j4<&=O84) z&JGqD;8&U-k#zS$065Z#y{@Qa9P98ENVTVR*!E<>SXqg6_=00(vWmYB7abQX zA8Q@<^;{bGI(%d|CrKL4IssX-I?T^MTB*a=U{Rnx%SIAr4en_QVl7yClFWC<*z#a4 zv~&Ia^-yHFSBPV4uW3(zeXT~l3V^7+W}dqD+hgkhK8^7L3)Y<`^Yu9rY|s1KA=_BF z&lU0Cv#_UDdU9g!W zAx`|Ki7N4=GmxiS#pk1oMXowis>m`e*x+kkXCfeWw%$3*DNpepLv*(6?lH8oeGQk9 z_;KguIAvtZj~G*f);yg%cwPQjd5Y(d`g)?g_^ian;pse?tlVUT&qq{}KQD-tkF{}# zvH9OhMZR$mJ9zVokR@vzVym~KHccEBhI%9r6S)H9;xui{7J^Ge{Pj|S@}#cm$>G!7Ls-rq|#b}vRjey&|NiKm;G|08E3f3sPQ z-ACfn5UXAtsMvF|<;1XQTIt3Mjo7`bLQBK(J1=G@lNWbUY<%BFGEDYjfB*H*{kdEA z#Btu|=f(PdQ4QGcDYAYxpOvTvwq78Um75yi^AQ`-*Ds99$I^&OjP3mQ4?3CET2XQ$ zWl`D0YD8n0)2t-C`L6m`Se_V2bpOH`&rU*~M48ytjZZ)<^o@eVJ4^z79{a;2tTg2F5x*_F)+rymUQeufL~EWr zJw*{2JUv#PqPnv^{V`TnA{h_7Rwk?XMdZIX#L6d1L?n8~!{2=G3??E4-CBrxF#imh zdAcmxMaR*yXxBHm{1}@zQOv@)X6RfOS1$_1E~UMDcj; z4wKoEw&V?vBbPMr#=BQX#8^G#$Kr32Ruy*6_A_kS9?l zcE_7fKoqeH?~^LDL=*nsD|eN!LW`~8`1_sm6k9`o&sxKGkIL_08Huu-`=C=s(XC-o zPh^?TN|fc3ej<~Vn=JGBi2OSAXR-3Js=8Q5bX$dNX?gv}jErJ0yZ_ITC+lU4H&9v= znfUUO>sTVhe^gIpm;~#95`gjRFzhF-OcUZZ_k;>f*k>_3;U}4OW3ZINv z^vf4X6pm;ilW%}Ipn#wA{WFI#lpW&x?i(3 
z=5P4z-p}uO8sg_4y-Mcu^CWox%AE2~S-C}H{@u8(Rk!l=$e5o4O7+;7&u3aJXpaG% zQ6FmQc}cL=*_a<(9hFUto>!hkp8ThYJ+B~+714WMvCU3=0-_MC_?=XtB^tBsOMT0j z5Qsf5-&&_UMRR^m&&>IJM&if)e4SH9b_3fyiIEGhxy44}$n~-E6dMU_r8I6Lj<1pE z`=U%%@i!9NUW%1ZR3jlVjB#!xw!VTqStAi!!-a`eOnH;ei2d=vSb z(iywA$)_RGZOvOUbjy=4>GN_UvV7&+$a62t#j91VJx0D3i-V0@-$gP^{9=FqoC|(( z0qeA59;W*j&M4ay>-$AjfVuJ>=mI`3QKyz}mdeXb8Soj2jp-R%qB62HrV?$0vqfO= z`%&4%YD^_80?*C6zoi5z?If3Y&M zHf^{D)bXw{t8RH??ShGnkmA_x?ww9~a%O$>eR^ir=kpQ2?FShd5;Lm#68%x0o}!Qh z`^U=DqVjsQT$RsD6p|?iOXU^6kd$S{$|y=mB=q0b0c#4GkQ8*c9Awy^aHN^1i(wx$ zbTRDnd58ky8zS{x65V5Q+v(m93ug>FAy%Fi0qM~^H=ma%AcIeo%F9ha_>9ChvERv2 z8Clvy3EDX~FWZJjWfP-K%$Mj-{X6ThKd|OSY!kPhiad!ju{+Ov0-}&D9VS(1iH6*{ zO?>1VPI-!LVqVYM#CPW(@*bm(sP66^?v#;Do7kh7Zayzjnr}W+Dla!_<}(sG*5Hek zkyYIlNc3Ws{_E0qMo6*0JQr!E_TJk4=x z>~1rkhG^ZtadZZ^NWZYS&~B(&w&9WP)o&vCO2&xfi#=TC8_gh271 zCicMI`X2Hm1rXZ`$|oQS!3md16IeNVoAi6Zz(5b1P_U~CFJU%T=#CW4|y=xB*W0gFcH(X1bzm#7h*x=t!DH;uq& zB(|wPyFMx-OPeaeNoNDWEkB6LCRUp&VIbJC>(^Ttzr;3m=1s_xC=+{XhfhEh`LjbZ zg)Wj9Bj2*{U_UFg$oLUIbjnk-YUKCKsv+KJvsW*}M)xD9jBKnLJzD6*=OupB;5kxx zxk)phk;t)rcg4!c+P)P@v~S}5k31oF-Q$!eXX!^5sArabJ|FShK77C_AG=OakwmwY zr>Cg$-uqdsJS{4(N6T9IyhI^+tyLqn0dNb`Z1;E zV(I7e5C!DnUrBwJL@${)ii-mBz_M6*iUQ*A(d;*$mna~&ESJj5O+fgJ#5OUwA}S+G zn<&vObZ%a*d@d@R7;R#aL_56diL3`$^CGs1m###fM48yu7Cr${$d2idDzrpH?%XCG z(dm?@*d`YAtWD(miJ~-XT}~NAw~5&O)q}Mhd|sk7Z~L88UT)INXC!j$owcztvZ}jc ziCzrfaThz>e|#Q!c6YdA+iH31pg3+>@B&g{wyxqOU#!kQ_;W1sSsJGvO+)i}iPiaSI*UW$a_eIRp*{3B36mNZI=3ciC!>Y4MlnU%NFFxRp;2%!oT6= zlQmmw|6}iS1c+_R;1dxo@7q3*AwZ5#LI5fr8oQpAT+FoQLo?GJnfr4=tR9>D`Ao%b z*yevkeW|4zCegd#&HduHP5dY-n;6}&9G`?p@!ux)!!E>$e|kSG_7pasfLQAL{v}sx ziFS>*!;4DpYulVM6%GDahA3V5$ERO`5nl!ySa$1=y-#(sCDo80;zCXH@4-RPe*Jq zuGlF>0g0jA3-^Ec57vVbJ5iVHMxJITDt70OPeZi$AD7XGS^SF%{dPSl{+q>~^E(Ih z+1u3ioN!_#e6$}Du|56Nz7k3dPk_YOMx1XgU3`OA-T z%2YIQ6!y%-vFY@8FJ?=QMT!39V5f|1OdLI0|HS7de$;)(N#*4x(R@ZC%fctc%E;Qt zC?ZFm7b{bXi1cVWoX<-Xk$x9Q<>e+Kd`4oW-aauZBWv3zq3@j= znRmYvl~1g;QRXDXbv1uJmNhbB+xVx8kg47_#$E!!ryvU2&%Y~IY>6h_xov#l5~obX zwy~&ZZR0V!`rpOqBTDp-zUP#YP21R`DRDk8QKEyFN#*4x(R@ZC%gX;RRz^`uPv#K9 z>N~D@mr+veeP4D3GVSht#~uM~oOgEjWR*CIIXQq-n8C2;iHP6-QMugrODO$58{c}0 zeZM%0`TJGKbnhqzb49V1_VVe7g@3wIuJ95w04;B3eqld&ePSvyH3zS;P49df;tW7j zO&@Nxe{r7eS+h^Sxat*F1d)FEwIcmuPyX=2FV^UchFJ6y+o;?gO;7WAi8cB^jZ%5V zU!&h`ij`4xjh3(gJJ;wxH6zpR8XemL?p&jPaV=6|HM-}Ch?U~%8FJq*VX)3OOf%CS z%>m?rSUsKt;4>BbV^>2zSZq+%{+PsI&Uq5xjUPtk6RSU#Dm)#jFqv$3>qE0zC8IyjDOuL(m*t+hy4Qt-F?@Apz7pZWH zH}=ARJ{_^DY`sU01QPQAzj|i&MAnZH=Kso0imJ`Hgm;PD5#=K+fI^6lmU z_>UHQ&^13n{;a)?T>w52k#hc@Nl`9WVjh4O9NPHC&M5DN*E2wul{c}t zR&0^^W0yE(D%v`VduHq4S6hn`eb!S>8AaPV(A(*OMDuxxA9cd7rSfu6|ofr$(>hfGkCiX3TNCIicaB}k);DP7RC!}8wO@AtXSJ@XKn_ptU)w^Q)NIJ|^dzn*yUDB1YS6^S z7j+{RS?nfzcA8klyUC7S8pwAOL)MJ9^SKYOb$1?`(Fc$&az-Bj8SEl!_t++8NSLu? z&Mva@Weu8ek*FRyve-qIykcTyxd?(dsBhixfs)#2@QSd=Og-RnG;OpzpIEv5A1^tP zVQrjpP@tXKsApAZfgP>lZ?<6@BptO;eq^igUu&21UkgE58>RDqz597yEd|4*kzKm? 
z;SecB9HW}#$YPf+`KO7M<8hS$#l>{I5e|!^AF$2e zaGu3jWRE;@nm{3@^*+X?>vmL@q_zTnc{;3sUI9iUyaMk#!?XhR1`lYTx@VyqIYV3f z=AluofMK#&(X9Z4a!PjeOmC*il+2LNhFby6AM`j{N>*?1DV!XqWG(#!+L@B+S5?sh zf0On7ZNoOmTeuZ4bU+BotpItg;N)5XOV0^`QV+lN22a3koF${jKnV1&m})zwfrnDxvh{n85_eH#VZsjhmih8B3s*wbVhc7vfA(6;iX z!c(_g@Oa^}5HwNnkay%`3LZ!9f2qS=Qf8OlHY!B&3m)Nz8=mtnS6xR(v4o~je6GEKSgvjzrC9WvD$=ok$ z(8SIcbwL(c>?YGDn^?uW$>&=%knbjj#Cgub8JnG(a5rH5SM$(}Zh&N!8QlP6u!~$g z-8L~p&Wwd~c9Gg|YEX6&X_-dsB*L z%Oi`AuBo~yT^#>2SD>BZs23;E0z2B4TW!NOXnI&2H?@YK32}`4Xv;s0xjg<`tdDXIUKn_thjSbyb+#% zk43im8+QG0<*@W_fxdT+snrj{Xom;C>mJ+S4Vn9-vfKZjet`YW5`nq{tUhuy!rg$S z%eunf1yolJPkqX^TaT8HxIu6U;Iqql1@$u=xO>1OvO!xVriB)_| zzq>*M`7v#%1&3q0;{kydVp?w~hhzHbm14!kbi5G`i-kY3&EHVD#aLvIocCjaLQLzW zopJNN`=YC&C)W)a@@NQ@dic#5PByrWvt+FJnFdWrM$}a}WU*x2wb{ffUNYu9u7P~X zFw{uGk}=~6ffgh~-|Mgv%^UB>wulwjc=aAK+Tl<+^-0?j7`By;ifjMm@)+LjPkBn9 zE}7Dk7&OA&fSuc*8<4BEbpsY{mk~I{)lYumwfoGornSo%PRZy#{Ncy^TA=XZ^>hJ) zjJpA6|5of2+)!TvES&SU(7sb!g2c8E)g(j~ZwqbDnOIrd0)kjww>|HHlIm(W#D^;f z8-DL`RCP5#2Rz)kFTLQg<^_Rvs%u6c2`uoIF>SYH*fmv#Za}Y@d+%_==59dqA41SX z!6WiVfcMqzo$rd1?9x~LF+@r!ctl;5LKeI9s6U%nSuTwr{$^GCHBeINYN`xMckq!h z54e@&II1`lfzK@bP^s$L!_A&&A(2ho3Uk2=L) z?C#zkNBv@k7gDIY!O_YvsvwUIe!m$lIX4|{Z9+prCq9u~*T&k8{k;u!gB z!0lW2-r!`IUAnGch~$gou&7o!ve>1EoNZ!dxio_Ko1K1+2J&`2%bSPGA8Q@qR+97Y z?gIr{$iMYmY-!uE!XJ9_Nvds-SaIi=cq2Uj73bOJZ`k!OT|8y1n?Ijp?j0;pILGKG z5VXUC|I$#~;0@h?=lYJhC;b3>O}Rkb0aibOp%Lx|47s2by8+epsn6W*YHu9p2Ei+M z=ptT0y;_Fu&oRBJQd>cmDfzpYytirFBeFQAS67=@#mDrLS`FmKv_Z4NF}+}zKnpRg ze~)lXU)>;9TujFs;jlP2XPdvFa*MIZ9(mR!0)?2?i(3~SD8Hd3^=||Ajd*VbZIHKcH{i<4Ls0Gp$dkq=*9~~=iV!IE z@LOZ(2H-Z%l2Li324%^pi@FMjES8KxpE9wEmyDjHHIOeEh8js&GRi(JP$wA~d)+6F z^e;_Mrk)=wR$$}R?}eZp4wd`H+m^u44QS0Z*1J50bJ%593)Cf3`pD4;cLOF)C@uVL zKy`gxS~mcv%_A821sOqJapeIUO>fW$uie3GOlwzTIIyDou;Tt@qCnxp>qh_#GCqy0 zpCon)Zm2K8f9s}JI5*^NVdi9Q33yu=7S$v~7H{7u2(gCprpDQc7kx_ zpy?|fM^#rt<=}fWo1SoAjMa7c^#biwSN%>CT41+-XR2w~5?;~8p`Gi?3oq<)H{i`_ zA!wrDane5&eBkRLQcA%i>Z(-u37Fk8Osp)IMi76qr)O&*PwC{QUi3G^7koI5w%ihe zLN`Ftp}1}Ua(HqN-4^1cCPbFUEOAA7=X-Fz22JdIQMY7~#XH~PMJ879ZgR^K4dlCt zA#ui6e7GAh^WHo(qZ=Tdw~TH8GT23~z0Wo=L(Ytab9Rvl4{Feai$t}`k;N`@#VQjk z%S8~xL0z}n10}`La0(2o%!k%^98DZ8uP{zLqi%P)IQCyF&`xpG@2H>!cC?otwhh~$ p=_rof>Ca?eTvu9hB85GX1($U@{=biQIojp*r%FneZ77w0{SR1tO}795 literal 0 HcmV?d00001 diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/inputs.npz new file mode 100644 index 0000000000000000000000000000000000000000..89c505c6693900de9c18101aa892f42b817402ff GIT binary patch literal 9138 zcmd6sTWF+b8Gye%bP4OWj_oRAt?RFrZd}4LUE`W+4!@*@oR# zBG&p-A_R;Q=n6q14w8!?DFj^WUgYqDi{xT2rcz-GE6&BDx!BUVC>kZ8&olo_c4iVU z+#=1l`{q4-@B9AWd-%TpFe9h-?LQo{@$qotXLo)w*%N~BzHm5v=j_-E&p%N(es=78 zdqS=*R@G_8E028j%OhXg6TTb1fAs0o=f8FC=;xxN{bzcQ_C!a|Ja_K-b5EXq;<UgoEvH<;@~9u0($YtUur1iO3Caqcp(2z(R1hR$B_MR0;%jC>AIHVb}fD1>?D zir@!9BW=a|(c)_hlgNde#k;>*JjISCepkgi(R1r%%W6&Wve6#It7m>U zn=&EX4B|vH2*r-H{`Y_Dz(>!$utNj(_C&Ydf&|e;RUD75X4t1ObX9ZmeMD(Ln*S>J-U8om`IxOH0dDe!coE$87Up>5x8 zASDS2Z9NNJ84RHey-2HfL%SW7tK*h)3+T^n;Z_<)=2lUENjcc$Es^H|B6lrS_!)TPxp%d_ zcz^Br$v>=&aI<*5ySI3rYY9Oa>;(AE)ex>39NlRmyO60!{!d=X5nLK&pzoHKz;o# zBG=Ek)oh)Y2D9e7itHt1>Nx*6bOD%Tegpmj;JQ6W)8+Q!E&ujQdy{R&`^@ew9wCNd z1EE+o17_Ua;7w2wS&6`Nz}HEFJE{vh51fI|tz-$-uF9NAJC1$8U&>s4fZds8<{UL_ zZ4P(LD6|Q+sN>q9XR#a}b<@b}>q6go_1&@E$X!!;AcXr?vvn-NCxE_e#K+Kc{a!DC zU4!fM+S2)s8{gl*c<7z&#XGrsi&r7Y8Nh<1jljY{b|eAX$nylbceuF*B?%f?dA>mTblHlO(O=O5W#ywC35;+b?i*p4^R zfuJ_P2Dm5&N`qNKYRKH7CP+Mt+yc~#+K^Y_50g}jIEtRwYnZGQ{em<)MeU%AfJd&o 
z%j0eqowFqP67t6yeu~URfJI617onpp0InlN?wkd8jZ0w11UP{|h(CuXhj!`)AnrIe z$Nu&9H?|kAZ}%3jZzW5p-8l^1Lx7Xu1h|513LF!tg#C4cbH{=v;*^113z_L9UU4ZJ4j%&tu!8uflu<`y$wfnizTnyIHWll}E^H@cLN* zSCKosuX$Yym?A5H^{?$S+AxCFpyBKo;DFIdmp~3VdA# zeGPexuHA4Fto;;p4V_Kq+^zu_L3Rb~HopuVL;Kk1oK;KE??$dax7VZi3E(PntJsCE z>vFyNQqM|01^Bde9exTZ0%dFpV2>Ej;dC%t`y%ptI$rcP-x|1mhw5#;t2^gt;`jbd zFChcZ8r2e!5@7;1i&m#dCX8NLOtySH$g2bG7=aGVcOy5mQ3LYW+S_B`2wk5`13unR zA9;$*p-lp8pl2oQq9SuvTYOzV&}QE~yap8Et-5QfA{R%<>Z65LK90VAyU+{x(^h{Q z4ZB*$UGEnU9D6m{zW261QaI;m;`hFKuBOz8(i3H#$jU}Z)Iuyxwz`qmvx;sFJu6g0 zW`-P_&_(E~JTL*j3D#!}Z8wy`W-@Nbyp}dIz}}%OyH+S-ev-Kq>?!F(>IicEd8#L@ zOMh$V%mG#;0Z$^cqjG?qqWux{hmm^?b+o+$f5w~}Y`^XO@%2}}utW8B@mH2A5%v*P z9`OF;iDLH6v>CE``^F1UW0VmAe^+{(w$gWyr{=t}0t zJ?@%H9q)P{Jn-YKuPiP1yx}xpkLQ813kigAK3eB4gDG9+V;cS5jprW zcnq8aJ6~XK6l_%zY>I%@bB%p~zFd!f6L5m83aocj@4-`VemmKIa_-{qEE7Z&jEZQN zh^S2DYv4J+tQ5gDA}+3E5AD)j$gAkNhpeh6o0*yfX3-l5D_@5mft~}$@Mg6!)B}O7hS*04sP=Jdo`nEAaR#`ru(;leu}|Ze&yN>NyvKedg?fUSqxn z48wc<@^Xgbj#u?Ry?*)74%OSmA6mMI%BPM^qPzxePhWwyG6#_R8dfCYhmD-M$B|j- z%g~eP307+XJOY@V7=8{=zZ;lX$=;I}pr^@P0p4n*@Co|*N|oW)%&x(E$||1+D(E;C z(dmMB4R%lj?b;%24#CIJ`j|kj?;F4r(2uVixCkBwj{?3_mEHHIp{gS`_ZMrjPFHmwmmtuF*9bXiY%bJ0JwL=4t2FxcN)Di zIDwyT%zG_|o#Y-mg8u;4*E%o_9RVk}UJ*QkEExpA1@NM}j2J$pieAqy?1I-{Di7#W zKcVB~e0lkqp=A5Xxr;xw^pJ`DWGPSdWg;^BF%ibl&F8xc{3P-Q{w}{;5)?WCt;8A21KtE>rDP(3--TaDK%_^6NS7xQhkyY`n{w#bS zzO3dr^l|L93C!I94u@U7dMhV^f6ordjSdRh&C{r;w5_T8t`vi&PVt7Yi} zTb4WaPWu~ftL1%nZCUQvJne5+t(Mo>42^c(v3uH|>{~5=^`R}x9owhGmYACHteI5}XLj9a3x?5#C2V=ct<0<+tbv3sRyTJ7u>Z*2#cj1HM@Yog_t7 zX2g3hGqN&XvdC(nJ_9UvCK>NV#CxCjeLvzwRQuaM`4@lhziV~=r&j0NPo92xa@^TE zI{dT8&tF{MygvNd-Qj~de|)RB$M+u%@7}&QzS-_@lB~M;&D^B#erKFT=l^YW_N#(^@%ZSM!^7j_ zr}afXdnLW=zj$)^qO<&!U)yPZdA|@|MkC_{HOJ|TPvN9 zIx|U=OYVmp<4f+>UvhE5`zxoHd_nDQr+K^Ul@(HzK;w2-Car4n&9=Mp^i@eWZ+E@n za_#Oc-MHO7Z{F_GvlZH_c2}19wI5auto-SncKK6iar>Rw!ctq*Sh zjj|IR+`9K@+>;FMfAG=Wn-7Y>{=smw2p1=hZ$CYL{Hw>$KX~%%;ipf2{p7{7=hxnU zg*>V%`pNQ{y!%#hH5#OT~*M5-dUp@VwW7PjutCRlY+@Z(A`+x9gaP#Mrfy(g7!)L!d{^Ibf zsx8hPddx}q?sq3STvQ0W(L4lhPB)2n8i&9#X+`$UhQRXlRk1e@ftx9U#zT*@blF3X z^lb6aBed&i=n;O5h912T2p>g5k8o^}K{XzzN~Fl3`f))N`;%(CuvO$=rK++j!q#N8 z5F%VLpecV_Fk(2KJEES`B&zd{dyLajN;q!_p&_h>!BxNV7!9t%fl0{Us?L)PuC^8w z)*mLZJ!BHsZFI&kR17~xb=wbh`*ka-+;DI_t=Mbcw)@Ms?J{XqmT$Igm#43)r+M4% zFWI)w(v92ppO!P$#skT*B(X3dSe%g&8M!UE`c zcHOvx3TdknqOCojEHmPh8!5bH3Y4@;ys8 z_VvHFnh(#@)6Kig;ltZ^KdOd1fBo?Jr!Ri<>qem;Eh+aoFoMSGTjgx-w~{ z?#=ey<>{+}Zf=C;qGPi z;=N^i@sKdT7Y}Kx%8WU0HRpU|dCq0hOX6IfzAAy{oNp}2`7GU7S;h)?dbW5fvK_8H zOhv+v(NttROhw`q?r?0h;#MtOhkp%|u-um{IZ+&Ibsy^Rr zXHuTND(U8}Z*R%gca|=@^`&QvTVGfnh!%Omk5TK}s&9Sa*t*si{x!6|t*W-7))x*; z5bsn)o3y^2dc<>9y+bBRQw$9xEZKxZ!}ab|*Bh^Tha+=Vy}K8%>OGT3HD=6M^`6co zch&o|oK^4XlU0>EUiBWYH-rVH$xx<0X8zxl3liagW^3QYBx|{~?~pKm?R!C3+-@Eg zcb5-~Wzq`nn+=QQ>8omP9u{|(42x&!vcqC}wm2+?)*Xe#@M9DfcS2bFd9*wlj!oiO z7-S^zYzoO$_F zwkC7wFa()&KwIN8RV(7*fV{bMKWi@CUv@4X5*A=5E8_pK;cl^zwknkv^==00nav*= zsLQk!>i_%I*2J50F3()ueg@}s7sop9cO)LNB+=`l)E?nuHC8TIMvk2Tvt&7L$ zny!og{bGmL%Ek{mlXdZbak@<2xgdu(G91osTWlU&%d~IaVR`0IyK{DX*0AWNgQwo)#-pmu?j2Kv^ zt?a$wj=em8Rk0b7Fn!XT5ecK`+hxZd^|sBMKzad_jy;T$Cms9tRL33;s5Y3-p8K}#&mxa1Ke^m$>sMDv=8K|SD-hY-69nYnZL=RWFWML0bY!!jA#P?g#F=N9 z84a;aTUF~D?p@0BFVhgyC!iS(F?s@<(-711#SJl9Iu45m;m4>U-l%Vg;n=!{82&Xh z#2ZzOMGY|=m|~7LcO?yR(QffSN}FHECjC9Ml_CyJHmw(n8Pz&;C;V7l_5ZL^td7kd zo3nV{yMV>>nLMhC&R9I3&g8W&o==~wD&~0cJS;a(hB!OpAx^b;9uAn-^9_^DuEjd` za*O97VOTuBpcvW9h>WYQ=Z-XfJ+JH*Hm@*53FnQ%Yly`~?^4FmZbkItD7uD2(_SGAebPc(w4wZu z)1nI5B-I_AHLAey>pM|(cSCjm%~o9B;qZ8t($8r4GY_9K8h)9!D%CgK@XPZr)9}-0 zSs4vKdURG!y#v5`nrk9YdI6KEcX&oJ>6dn=rrzOzys7tg*3^4@*{OF(ScuJBQx9pY 
z`ixO;WuTtfXp(`tOk1ITV{7W=nXB8+=*Q9>D;b=l4Ki74>gfsRJ+Y3~)Qf@6QfumQ zx~6ODeH;eM)*xH zwAq?^@z5)KP5mrQ-kN%Pnm7h#JS^|M!o%_^ZDsGxcI?&ptBP$N35&GLJuDw59Cx>6 z#~$^zjn~wtE@0BJhn0s($9`j~V-E*Z9ecmYn)*&w$G)>{#~u>qckCf;Ri82HjE9uH zna9q}18SA_vQW=ldj{%dADI_7UA+>YF-;vO3$+zbQ=`SKXxIG($OL({oiXu0X!ZP7mTu(ZCH&f-FbVZs|0iNb`p*!_C$gg07R z56RPO51}Bv_C;IKn=TtIXpb42S9&uKKQr26nf9{UV|o6Hc}9CopPM!x<`f0CRC`R% zm#ia%he+|VQTR1(kI{z*lJ*#mu5FLupQH8|K5i1X$8czBI#^FlFWQ+twZa~k9ky$*Kkvuds(_o|6I3-qZ?v zIAC78OP2tPEqBYUu!n^CE9?tGWyT(t-po#=j8Ivoy{u4Kp1-Q;c~v&M&Q;v^lH>7c zmsQTtB|TpfDnlcWLuL3i4wcdCJIM-rI64h*VGS@1aErF8|EHbrO`f!OH`ACFattGm zFbWAHj$(uOxg(B-OYEVURJ9zI*u#0F_!=%Sim%~a!MIxpho+?#fd(|1D(;Y4uqN5!(Ijii&#U_Fm?_4jU_)@Fv zak{3f?Ek9x&}X!B{U2nlvM+F$v3H?2vsp32VVU;LJ1oy!HNy;t>6XU~htU?xMyu>G zi|Rr%B5-=4*BX6srI?pzud<(|$y;SlPZP($jK}A_nMcqW_R6%Cy*IqdUY@_I*bIT` z)9Va@(c|;~tfsqdzRaFpLD{*74H555I`{5$=N=BJI`^$6EA2P3I`v|XWu`cELTJ!crxgjeAe#2tyVO`A@iDEy4qg6ezV+adq|kS+P)xK=Inl%*~64E zd@0jjQnc)q=dUVzUX|6Xwx`>gnn%mxq(zv`j+WzO)rBU}GPLtJT83ZaXc@isldQIf zqth4{wx6dlZqa`BXwcD1!&=BO3_8NHau{?JFZi81=xDgw9-2ugyy~61aNa1yh99F4 z8(uezdxmgmT593hMOtck8`_Iy)iQ1-^&Oox8m@+4t!{PD7C1XoMf96?L z^Oj$wy`+|3oxdvJ<}JTSyW}`2dd^nO$fM4`@r-=x3a(8$r!Zb9XXKL)m4)T@a7f;a zd@pN8zPId*JS5DYk%zQZoyM>;_P+IJ_N!#TF4I=9-_&yZ>{VxwPItXD@63vmR-IYS za{D-4RZ$H#wHIN0spa-KUDM_Ezm;xk|7Tgt?F$@cY-H%q>|@MuSf+jR4$Cu#+HD@# zmfd|>6jkN0(QBWx;8wHotao$bk= z>)39x+}^GJXy}dlkNl+{U@so)&;HfW-_J}L`uBg->MVWQ7i;k{a;fww`%;So@ z%D?y;&Euj&(|KH2icjZpi#E6aBJ4Hmr?cOX$}(+Lxo^0~DbHUOcSfU3AFpOK%IE=aPNPiE zmo&;~X(lWz6t7uTh{Bulz0D$oPyVWJ(kR2xwT&|TbJQp|t9pwYWjHi#lwmnOMV+_9 zz1Z^^<6 zP$%)Sd)OIT4t^#d$_vZx;gESvFI{#o-aTG!**zo-%kCEyE;IJS^k;T8WrWKzZPo1F zaJVebU)A=!I-6a0FYbHE)wgIzR!+D~&zFSD=uuA;F2k?!7$$llC|-Auf<+qSg6A~I zE!yQCjXbu}xE6AZMjm1Lv50HsXrdS+oxAe>n|TxEg-t9PvV=27VKz*dqA*+gK4S5H zR~g*Wf(z@~X~E@fbT4-0l-Va$9-TF+^6+b1<G1?6XN$tjgR6|jU#7jR z#$TSlD&UO9zwAS|e)Q0-nw>}U<>s^V^a{$^c^FHS9n|DAXJK|84#}LISAVv&*)+NM z1oGs2FjWD+(`#@2@Z%emq3rE*3^@V?qCUdoKEf+7Am9wp6GFLoi zEq_lYbNxkI-lJLYb}|c&a*Srd(P_dgc+np^jo05p2I=)j6T^7Ret*%f_r{~01&uOe z*UMJ<)HW1(<3CGNrM;|1Idkn9jWT`S+I;=Jxap-DWqQ7(QHB|5{7@|X8aK-5vk6I~ z3`f^C%J9!oqYR&FiPzu5p(*OHa+5a7ylwBrCbqI|r!6$JnP_1s^yJZHhu!+g`g=Gw zXZ?NW0@mMW@+j{xXZ?LTlh?ZbK7F!KC-M4wcqCE|ekLE)3+wOUkaXXoK* zP1!+BK7tlz=i!jd*?ILxQk%`riw`+0IXe$2mzbT0#8thX-qsO#9{`mSUd>{~NWsWa@AX)Akgc>TRRe^s#=p)h^EoguL9347G%Ht*rnE12}~@g~Iao7cq# z#J38Ar+>hCy#GBqr0U^!8?C=re=W6H4_~}HvSbe*QZCWMhs0IA#<(*cY;M286ZR_Y zCE>2lUll@g+(p_rYBk3_PME#^K6SoiA`_ZMJa-Mh#uJ(79o}TAFC1OF)EE9an#|R{ z#~kf;N+)w+h@MX7GWNfxe>k<+_xNSMSnUqUVjf6q}Lx-rPAxau>J1~ z8fC_|m+e=0;#;M?tVTI=?HP@7*(bimO&6^>zfq>=OB!Y15;w~5YuqTK4=E&#G8|po zD8oNTjWT?{C0>6Iho-2*l2wX2Z~uEgWRtE_MW?OyzxTu9Rk-Ya*tQ?9zemUBtiSJG z!20`49%Zb_S%06-+i)^ zL6=*94+)o8e_s$Tb2iA78?34l%NU@SX)h^UZk6Y+uDyAAqX;m2s?vFJ~!He7!X zEu}hhwEiB>9M<2XXQEM9#3F0_9|@8Jiuz+_^Y&+)cC9OR|VX>@fT^AYy9JcJO?mo1GUsOP8FThm=dq&O_p=USr%DyW_TJHnU{l zF4I=H-_-j1>{Vy5PPfEl%*&%KIR8hh`SN9Pv2iM}e@}<__<8)2e-~Y|&g8Rnz zJKsM3?D^xzZ~vFY|LQc}5O%)9os1n0+cP^IGaQy_-@L=}%vCeYaJcOL%%Z5Osc56+ z_n1X>p&5SD3vIUiUOXVnUVcAIleheyo+gTcI~fnyw`U$wXV@##R`%ZP^84(yX9!Fm zZ)Zfpng{IByQLYl(km!C_^?}~?BFNgmiw=(<@az%=l6y`yE}Yv>%+S@-@o<2aCrOP zqgy|{^p#df3JG!Yrl2;?D;334L^T!JpARC{ocl(uD1>z zo?gTK4~LH)+`3g>%kbt0AKZR)`~JP*=Jmr*j$hP29XK8JJiPtWd&Q;SeE-q#!1p=g`qswJi>tr4;Oc7&xbDJx-RQ#q;YxeI$b4(^ zb)e1q`%mks{5a5^(aPz~U*-SX_^0FY+po&+#p9!24iArypVpV;?D)BN{TELTU!0ZY zX<1v%X$HzU}uoLSsTUOBT~bY*vo+dO!c+w5*fxxH89R({)S^ZGn} z4}SJ=IJouTr?)1tE6QM*2vEL&ECRf@@lV?A(MnMXKdGwpi=!`_Hva8N<8NQxX7A3G zfD+jFgCd0d)n*}Ny4&l;z00fQvlVpf08__NQusyo(ZTz&TfxlMOKvp#Dc4HZ@lu;ZHUn!$F 
zApWGdwas&GZJJfwYNl9a{Yho4-rI=d#MUwqv70h=WgdMsXC2gHK&X(;DKm|7iROw0sDa_XlQhNTbj8QqgBS z?hN8^v1E3!u~*M-XI}J~e|6K|cA_!VQr>#PTsAP31 z@1m0El$msSXIsn5*~+WhIS=q^OL6(C-Hr#3i>r3dz(PFJss%~@xvyQpq5HTmNr>z~zSJ?R2+2WWe7A8M&h6`^ft4qGJC>w;}xp%Mm%ZAtjc6}R&Lr% z1{vd@-sbWQddW7d1`INWK)rqzgWL&vuNyzR%&U%GGSO&o+ZiLJUjG%UBiT>#cgN5F zv)=Hx>6bXVz0I9FqfzTBzfterul5%e)W7;?fB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`>bZ<2xfPXp}wGytaGB&M!~M+O*RfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7`Vy? z>OT!|!>0i>i>ur$+&ljaFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs# zzyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?mO)^mbX@EVS2Eg>2#MHI$ z$N&QjFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d| z0}L?000Rs#zyJdbFu(u<3^2d|16SF=#>>%ad%rzeS#KSz4Gxb7Cr86>ue(`%8({S+ zmy3HB7Z_lG0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1R*OVB_UzwY}dSt*o~W)&_^igOj6Sx7Y378I4wzQz$28p)43+fB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+~Nk8`yX`T5XS3 z+WYIRgSElo@!;fW*zI+DcSfVtt6VPbU0h&*0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMzf`N^%N3HgLdw;!kur@e69-JHvyS;Az z&S=z9Ore;Rg|c9P0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7`QA0^`8dV@@W7hFN>t}vc~`e z3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs# zzyJdbFu(u<3^2d|0}L?000Ru92I@ZzutJngcsw{c8g_fV%{!w}>#7!s`xXuuV1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0S3NF2I@Zz(Di8mOutD?T?>y4Fu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?0 z00Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L>5l?~K?8ldOX0Gh>B zZWivHe+C#}fB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz82Ba`sQ)xT-=_gE{U$MWEj%*700Rs#zyJdb zFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?0 z00Rs#zyJdbFu=f7Hn8z>wA$Wpk5<-O2Wx}Fqld@123^2d|0}L?000Rs#zyJdbFu(u< z3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs# zzyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d| z0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdb zFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?0 z00Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u< z3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#aFq>gyd15zM=S09_13}K;P7~G zay0DrdOLSUqt&ZiF791iV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= 
zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0S1DBjju+lQI6NMl91Xj@-tL{zsHK=fF)0gW z!2kmcFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d| z0}L?000Rs#zyJdbFu(u<3^2d|0}L>5SqADq4RFJ!0g${blFrK>0}L?000Rs#zyJdb zFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?0 z00Rs#zyJdbFpwIk|1`j!PXnaIgpKlGfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{k<31GS$9 z=x_Qo0FswQ(s|iqfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_22ukXFGs8G{q|^Oy>+lQI6NMl z91Xj@ez*8Gz^bxJ%Sw4D4+a=ufB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7`UniHeQZa+xzX&%6jWyZE$!zI5`@2d;Q*>(P;ImmWlfo z4;Wy80R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1R+tz{bnbYJ0TO-d}GWtPKv22Pa3vZm-|JGa9WbtF)|?hw@;60R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1R+EYGC8*QLDY*-d}GW ztPKv22Pa3vZm+*}XEbVE)gp1%V+TL%ER@PewYlFk%!O79E+w0#bz74Q?mCMDwiwg`ezyJdbFu(u<3^2d| z0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdb zFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?0 z00Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u< z3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs# zzyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d| z0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2ezFtG7*wA$Wpk5<-O z2Wx}FV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h%ADjV2%Ia+OxR@(dPt%J3};ql<)XxQy-ZQdD; zRK+mTEki0CC&dVMH3^2d|0}L?000Rs#zyJdbFu(u< z3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs# zkQ%7}G(g{{0n%c^MtLy600Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d| z0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3|y9h`cDIF`7{8MmqpTf z*<*kK1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^vXl1>1ur@e69-JHvyS=UL;@beL z$|@}@<)J(nV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg 
zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h%Asv6jMIa+P+w?`}Mt%J3};ql<)XxQy-?c5oSRwa0 zBLfUDzyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u< z3^2d|0}L?000Rs#zyJdbFu(u<3|wUcwVwvq-t=hz&EhII3-``H0}L?000Rs#zyJdb zFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?0 z00Rs#zyJdbe3J~+e;T0c(*T%$lbE^|9vNVO0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=VBjhn*myZw zZSS{7E9;N)o7?QQppZv(7e<#KWF;sOHtJngcsw{c8g_fzTX#mI)vH`C?p<79 zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z27-Z&uSc!+etUntb+9%#JRY1J4ZFSV?K`7UOEHCFQWnaB0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+~PC4Ag%bV8^Ecki0CC&dVMH3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?0 z00Rs#zyJdbFu(u<3^2d|0}L?000Rs#zyJdbFu(u<3^2d|0}L?000Rs#kQ%7}G{CM; z1Ej@-jq+fC0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7`QA0^`8c~;nM&}UKUB`Wsd;{7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XI4Q#v|t+w~uqm}j6!P?;PcyMww?Dn?zif;p~Dyy`tl!x+Q zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_ z1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;= zV1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~ z0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz z7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|Xg zfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`<_1{h#~0R|XgfB^;=V1NMz7+`>b zt7>55iTyNt_|OR_VmT^;nU;cvyx5ob||Nr{rf3Hsd|HfZEOZ(%Hw);w{dpW7M+HV!t9i6)$QomhZaryuE=U(wv zNVUZMZog9Mot)G&_q!WXXWZ|NkZOth-OEY+*UohOcBEr7C;9eS1;@!FUGwjF_N-Z~ zW|K#*7MIv;@+@Gp_Z$h!x#uievlkuNTIQa!kj>t6q%7l}ds(_>?ztD~TH>C2S;}Va zxfdzRx#uihv-ce7TIQa!l+E6Aq%7y2vvkegbEIpDd+ui`o4MzHq-_2@-)pzlTL%Zj z;m__4A3gZ!)(5x$#_++d5AWW5|JLBvy+^|jZ$5lD9Nholqq{dB-2V912gA+bqvt1& zZ$CYL{Hw>$KX~%%;ipf2{p7{7=XX~B|JA?y-bQwQ-3!jIcftAfFF3!g3(jx*g7e$C z;QV&a&F|xXxAW~MPm4j|wQHUAcI(>rJ1gzhudeT1>vY=1Kdlx&uYCM& zr#(149-JHvd)I!a^W)<8^(e{dF{`J?6hq8U4v&ZZsbk7x<72iO9JAfvn4Jd4>^3;& zMuTJa<{tAObiOya_wHtsgSt%)nl9>4jM3@lmvwZ|{KAe7nqS({LGz0{I%s}*M+eO< z@L&AJ&UeOBlscWC4?4SNVRI5LKNt>g-+OfHr?(yq9}FJeyZ>M?4xXFCC#V1W+4C=| zA!%)P_d9#_*%e{8$m#t@!@IZdjkD|yCvBjdK-Fe=x3kkc!<(Oedj948mClcb4<5hx z?C{IS!_c!#={aq>r#MzW|Gmz&@**1?{rjEYF2cm&(-&Vpd+~S}#y3;hSMEOF=&YX( z_D|b*RR(D|I6Zdd>{8Oq!|10b^RpQ9c}7k2%A2gNq9%@go>3FM;li5uk2*iBYN9%A zdh?+kP77)p%e-ULf|^@xp%$V_URY-I@Vpcx?%bg#jWtZM&IPhn4 z+wH^qfADB<^XFB&efaE`$6p+Nbsn+Pl3vLXvpG{8uPA0Sz3aN%;*e`guw+OhKmFZn*Z}jlxuve_g3$H1qyNS6Y8@rTvGU zzi@U?)DG+JqG2!8ZP*Ka*t{3oo+*?cb$(b3$bL}N+-ZfhI&ZZ{D@8ZixEtC|OC>M6 z;ltZ^KdPGUUq5{Q>5JbydGT2gS({zP{cc_O{cbh5-`kzuS=7A$VKG0ub*r454F@0H zeRTW7ySLxJ`RMljd&RWubWB{e)!KW_0KYSHuXXk^`n8?(R_k&r!AB9Q?y!d7dkw*7 z6v~Z@E|eRK3ndQSVF_h!_=}=zgQL^v8rDdrvM;;NB-BT>G~3bE4ZOlyN=^3J1FyY{ zu8Y0Ax@gc9MRn2O=(H}va#u}Zjl0Gu`-Mkm4!&mAWU9C_Th*r*v-0`xPCBco%bj;< z!{JxaVZDyw*X~8T+0An^x2nPrCY3O^8P~STbIX5b0M?+zrxz0zG^bdGtB(g@QT}z2 
z{KL*q8c$H04V&lo4K;J<2IJQCi=Za*;|jOElvv*|(((+2Z}xIX3%E&K!`l$B?)PXHCat+YLLYoKec`^t5(Vkuyq}nxNL^*JNB)wBy$?E}LB(slCFSJDyq|sm-t9 zxU3i_U*x!~*Z?&BP^GRR%>_6+04Z`@_~55xA9Y_@XHD7fG-RI<4T_yf7hNn>uaRJ;ZrqoZW)hjB(Czgon;siWj(2!*&x(-Y&LgV5U!-**sNGucpYQ2Vx!_k z>zSHsFKo|qYb)rP;&AS0Yo>vo+ni^|W(_i*x}3R{9Bv>Pn?+}+E1aBYeBmSOUW1YK zUn({noc3h#sJiY(=Jt(gFSXmSm&zEN^=4 zm(*Ti&K*xJnAGOiaBx=iQ5QNmyYM+lufd$8a=&15)zJNohVFCX!R(|Yt6|m*TBqhD zb=f6jfw|E(1jD$iZg6-SZKJL6a~58biTSx>t-8@!w0FKH|LoCOvBUd<``04JBoHmE zk-3AX`Wk6)_*9KV&0%&tJMNC-DnGYMP7lvqIh)yTy>N7Px?}FF1DiV%%^sc=OW?0% zcs4uduB-345vQg>PffY&vP&BD-0BK^raa^=oy{FRoz}tZ@a){f$JnMWr@{C#8JT#xpE&h_`2inb#Bz} zNl4unvzunS`f~!p?7XDz3g@=W*>K*UnwHdM7l-rc#l)%MO$g^vXWiuRsc<`YL|68y ztDm2qpxjP;Zf8+DK%3hkot_|nfOg?iiGG8rL{cSli=e(r8XP`VC3A;#X_ZX+s>LJh z%mLcW`)kbyXwgzh{m^pd^M>nZ#Ur5CF+l6j&b#ZHU`~skns(P^m++Z8hz)$EJS;5L zw27WOzMmPPoj-x99ic@duC$iJ9mFHF_zZQWlM|3He0<$+FutyOs;FVs-AL1(s_D-E zj3L_A?7X#hfRQm^DY8uZ@Y?*E4AF|F{5pncvuh`{SD10f{i(&1+WZ<0(TX^8kwdht z3!j=4IVROx2fBhE4HD{(YYM;V8vq#*VQY47Qg?+J0lmm^VZkH;eQtOT;V>SeH8?zt z|HWivK@Xh9|8ozo8=@7Hv2#w4Jw)5O@aaU6V_ua+Q;K;#R9qzu4xg%$bNkqFrx~9h z>NwKtDMxgfL$sMs0-RHSr&qIb_ImDV8FtSdj*pLRI7BP9E4+>&+Scsc`&C=?)YQ8! zyQD?Wt*+3br##FZ@Mgf)?98=xfRQm^DY8uBVr_m+hG<1oehovkt=Z+1+AGYt`A|&rP;6<|fexuxdX&(G>n`csF2sc5YJFG&6>1MUIL4y6lpG9z}&T zfQEn`ch?OLpNh9p{7g?!AOGiu=hMS$hiK<^+u1|3?F*ky6glQq$=n;NuaX9bPgTj> zPCTuW$tqg(X|Jh;*7ESoA==DWmKqPy<}c!gOKCVnEB5NVmLb~g+`Fy`X0+(7si}8e zc5#cIUtNLEl!v+VuCi3qA=o13B@p^ULjWq44Uc*ZQ+p{y*+5tw!fThSXDVf^*nhep3ru-U)Xxp>PC$(3YbH`Kb zC$;%C9HJFN=8GJnZD07@q{uO;-a60~{Aku!bKJqodgoxR^WD=Ai{1b7Mez;D-|BP@ zpIm?T^yydEzc^Zd>)^+o_40ooKR1V(BWc^16f3fp+dHk2pAOHHv9^~Ya* z`S|&hFCITVK0GS^{om>QDE;@RpPZhlb$Vsbp8x9lmw$Hr?D;33T|Yd2{B*tbZvisQ B@m>G` literal 0 HcmV?d00001 diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/outputs.npz new file mode 100644 index 0000000000000000000000000000000000000000..07fda6854dc56be62ead10027bc9e4eab42d2308 GIT binary patch literal 526 zcmbV}F-yZx6op@`iWEV?j3VhEQ7FYiwNM0|L@1UZIMk(-Ce=aOlBNX-HJt=kb$8H7 zbXNy6I68F|>FOflAF%q}wu6g{FZpuLJvZ+qaII`)R9Ax)Q_nYMx2-89Oe$Sf8(yv9 z6_yf}+M%YV`?V`siyKx+o_jaGh>*m<&+sW%yQNB++w9rb!|uTjp9Mw zA+PTjYYz27GH!6Mq!!~w%lOxd-l=vt{~d%ckjD4WZJ5Cy^@1=&Cm;~WJ1{%+5BM%z z4eKaA2Jh59977(S$b3kEWNGyJi|^n+&?MiP!e5Zz Ty print(f"Warning: Empty input array for type inference for {values}!") return default - if isInteger(values): + # First check the numpy dtype - if it's a float type, use float even if values are integer-like + # This handles cases like [0.0, 0.0] which would otherwise be incorrectly typed as uint8_t + if np.issubdtype(values.dtype, np.floating): + return minimalFloatType(values) + elif isInteger(values): return minimalIntegerType(values) else: return minimalFloatType(values) diff --git a/TargetLibraries/Generic/inc/macros.h b/TargetLibraries/Generic/inc/macros.h index d97cfecb7c..0b5a0e51fb 100644 --- a/TargetLibraries/Generic/inc/macros.h +++ b/TargetLibraries/Generic/inc/macros.h @@ -7,22 +7,28 @@ #ifndef __DEEPLOY_BASIC_MATH_MACROS_HEADER_ #define __DEEPLOY_BASIC_MATH_MACROS_HEADER_ +#ifndef MAX #define MAX(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a > _b ? _a : _b; \ }) +#endif +#ifndef MIN #define MIN(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a < _b ? _a : _b; \ }) +#endif +#ifndef CLAMP #define CLAMP(x, low, high) \ (((x) > (high)) ? (high) : (((x) < (low)) ? 
(low) : (x)))
+#endif
 
 #define inf 1.0f / 0.0f
 
diff --git a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h
index e44d3c20c6..1305ba6bff 100644
--- a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h
+++ b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h
@@ -23,8 +23,13 @@
 
 #include "snrt.h"
 
+#include "kernel/Add.h"
+#include "kernel/Div.h"
 #include "kernel/Gemm.h"
+#include "kernel/HardSwish.h"
 #include "kernel/MatMul.h"
+#include "kernel/Mul.h"
+#include "kernel/RMSNrom.h"
 #include "kernel/RQGemm.h"
 #include "kernel/RQMatMul.h"
 #include "kernel/Softmax.h"
diff --git a/TargetLibraries/Snitch/inc/kernel/Add.h b/TargetLibraries/Snitch/inc/kernel/Add.h
new file mode 100644
index 0000000000..7a65e82712
--- /dev/null
+++ b/TargetLibraries/Snitch/inc/kernel/Add.h
@@ -0,0 +1,21 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_MATH_ADD_KERNEL_HEADER_
+#define __DEEPLOY_MATH_ADD_KERNEL_HEADER_
+
+#include "DeeploySnitchMath.h"
+
+void Add_fp32(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, uint32_t size);
+
+void Add_fp32_broadcast(float32_t *pIn1, float32_t *pIn2, float32_t *pOut,
+                        uint32_t *out_shape, uint32_t *strides1,
+                        uint32_t *strides2, uint32_t ndim, uint32_t size);
+
+void Add_fp32_lastdim(float32_t *pIn1, float32_t *pIn2, float32_t *pOut,
+                      uint32_t outer_size, uint32_t inner_size);
+
+#endif // __DEEPLOY_MATH_ADD_KERNEL_HEADER_
diff --git a/TargetLibraries/Snitch/inc/kernel/Div.h b/TargetLibraries/Snitch/inc/kernel/Div.h
new file mode 100644
index 0000000000..e9b257a634
--- /dev/null
+++ b/TargetLibraries/Snitch/inc/kernel/Div.h
@@ -0,0 +1,44 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_
+#define __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_
+
+#include "DeeploySnitchMath.h"
+
+/*
+ * Element-wise Division (FP32)
+ *
+ * Computes: output[i] = input1[i] / input2[i]
+ *
+ * input1: Numerator tensor (float32)
+ * input2: Denominator tensor (float32)
+ * output: Output tensor (same shape as input1)
+ * size: Total number of elements
+ *
+ * multi-core = yes
+ * parallelization = element-wise
+ */
+void Div_fp32(float32_t *input1, float32_t *input2, float32_t *output,
+              uint32_t size);
+
+/*
+ * Element-wise Division with scalar broadcasting (FP32)
+ *
+ * Computes: output[i] = input1[i] / scalar
+ *
+ * input1: Numerator tensor (float32)
+ * scalar: Scalar denominator (float32)
+ * output: Output tensor (same shape as input1)
+ * size: Total number of elements
+ *
+ * multi-core = yes
+ * parallelization = element-wise
+ */
+void Div_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output,
+                     uint32_t size);
+
+#endif // __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_
diff --git a/TargetLibraries/Snitch/inc/kernel/HardSwish.h b/TargetLibraries/Snitch/inc/kernel/HardSwish.h
new file mode 100644
index 0000000000..a0cfdaac12
--- /dev/null
+++ b/TargetLibraries/Snitch/inc/kernel/HardSwish.h
@@ -0,0 +1,34 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_
+#define __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_
+
+#include "DeeploySnitchMath.h"
+
+/*
+ * HardSwish Activation Function
+ *
+ * Computes: HardSwish(x) = x * clip(x/6 + 0.5, 0, 1)
+ *
+ * Piecewise form:
+ * - When x <= -3: output = 0
+ * - When -3 < x < 3: output = x * (x/6 + 0.5)
+ * - When x >= 3: output = x
+ *
+ * This is a computationally efficient approximation of Swish/SiLU activation
+ * commonly used in mobile neural networks and transformer models.
+ *
+ * data_in: Input tensor (FP32)
+ * data_out: Output tensor (FP32, same shape as input)
+ * size: Total number of elements
+ *
+ * multi-core = yes
+ * parallelization = element-wise
+ */
+void HardSwish_fp32(float32_t *data_in, float32_t *data_out, uint32_t size);
+
+#endif // __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_
diff --git a/TargetLibraries/Snitch/inc/kernel/Mul.h b/TargetLibraries/Snitch/inc/kernel/Mul.h
new file mode 100644
index 0000000000..d851e2e3bf
--- /dev/null
+++ b/TargetLibraries/Snitch/inc/kernel/Mul.h
@@ -0,0 +1,44 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_
+#define __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_
+
+#include "DeeploySnitchMath.h"
+
+/*
+ * Element-wise Multiplication (FP32)
+ *
+ * Computes: output[i] = input1[i] * input2[i]
+ *
+ * input1: First input tensor (float32)
+ * input2: Second input tensor (float32)
+ * output: Output tensor (same shape as input1)
+ * size: Total number of elements
+ *
+ * multi-core = yes
+ * parallelization = element-wise
+ */
+void Mul_fp32(float32_t *input1, float32_t *input2, float32_t *output,
+              uint32_t size);
+
+/*
+ * Element-wise Multiplication with scalar broadcasting (FP32)
+ *
+ * Computes: output[i] = input1[i] * scalar
+ *
+ * input1: Input tensor (float32)
+ * scalar: Scalar multiplier (float32)
+ * output: Output tensor (same shape as input1)
+ * size: Total number of elements
+ *
+ * multi-core = yes
+ * parallelization = element-wise
+ */
+void Mul_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output,
+                     uint32_t size);
+
+#endif // __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_
diff --git a/TargetLibraries/Snitch/inc/kernel/RMSNrom.h b/TargetLibraries/Snitch/inc/kernel/RMSNrom.h
new file mode 100644
index 0000000000..16e25cd38c
--- /dev/null
+++ b/TargetLibraries/Snitch/inc/kernel/RMSNrom.h
@@ -0,0 +1,31 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_
+#define __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_
+
+#include "DeeploySnitchMath.h"
+
+/*
+ * RMS Normalization (Root Mean Square Normalization)
+ *
+ * Computes: output[i] = (input[i] / rms) * weight[i]
+ *           where rms = sqrt(mean(input^2) + eps)
+ *
+ * data_in: Input tensor [batch, seq, hidden] or flattened [size]
+ * weight: Weight tensor [hidden_dim]
+ * data_out: Output tensor (same shape as input)
+ * size: Total number of elements (batch * seq * hidden)
+ * lastDimLength: Hidden dimension size
+ * eps: Epsilon for numerical stability (typically 1e-6)
+ *
+ * multi-core = yes
+ * parallelization = vector-wise (across batch * sequence)
+ */
+void RMSNorm_fp32(float32_t *data_in, float32_t *weight, float32_t *data_out,
+                  uint32_t size, uint32_t lastDimLength, float32_t eps);
+
+#endif // __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_
diff --git a/TargetLibraries/Snitch/inc/kernel/Softmax.h b/TargetLibraries/Snitch/inc/kernel/Softmax.h
index c2d7596e7a..3795bb4f3b 100644
--- a/TargetLibraries/Snitch/inc/kernel/Softmax.h
+++ b/TargetLibraries/Snitch/inc/kernel/Softmax.h
@@ -9,7 +9,7 @@
 
 #include "DeeploySnitchMath.h"
 
-void softmax_fp32(float *input, float *output, int32_t ldI,
+void Softmax_fp32(float *input, float *output, int32_t ldI,
float *output, int32_t ldI, int32_t batch_offset, int32_t batch_size, int32_t seq_len, int32_t input_samples); diff --git a/TargetLibraries/Snitch/inc/macros.h b/TargetLibraries/Snitch/inc/macros.h index bc1191d25a..04bef2394b 100644 --- a/TargetLibraries/Snitch/inc/macros.h +++ b/TargetLibraries/Snitch/inc/macros.h @@ -8,10 +8,19 @@ #define __DEEPLOY_MATH_MACROS_HEADER_ #define INT_LOG2(x) __builtin_ctz(x) + +#ifndef CLAMP #define CLAMP(x, low, high) \ (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x))) +#endif + +#ifndef MIN #define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif + +#ifndef MAX #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif // JUNGVI: The following macros are here to ensure compatibility with some // PULP-NN kernels diff --git a/TargetLibraries/Snitch/src/Add_fp32.c b/TargetLibraries/Snitch/src/Add_fp32.c new file mode 100644 index 0000000000..235b258511 --- /dev/null +++ b/TargetLibraries/Snitch/src/Add_fp32.c @@ -0,0 +1,102 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +void Add_fp32(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + pOut[i] = pIn1[i] + pIn2[i]; + } +} + +void Add_fp32_broadcast(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t *out_shape, uint32_t *strides1, + uint32_t *strides2, uint32_t ndim, uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + uint32_t idx1 = 0; + uint32_t idx2 = 0; + uint32_t tmp = i; + + for (int32_t d = ndim - 1; d >= 0; d--) { + uint32_t coord = tmp % out_shape[d]; + tmp /= out_shape[d]; + idx1 += coord * strides1[d]; + idx2 += coord * strides2[d]; + } + + pOut[i] = pIn1[idx1] + pIn2[idx2]; + } +} + +void Add_fp32_lastdim(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t outer_size, uint32_t inner_size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + uint32_t size = outer_size * inner_size; + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + uint32_t inner_idx = i % inner_size; + pOut[i] = pIn1[i] + 
pIn2[inner_idx]; + } +} diff --git a/TargetLibraries/Snitch/src/CycleCounter.c b/TargetLibraries/Snitch/src/CycleCounter.c index 3861c421c1..8a99c312e6 100644 --- a/TargetLibraries/Snitch/src/CycleCounter.c +++ b/TargetLibraries/Snitch/src/CycleCounter.c @@ -6,10 +6,15 @@ #include "DeeploySnitchMath.h" +// Define ENABLE_INSTR_COUNTER to enable instruction counting (causes warnings +// in gvsoc) #define ENABLE_INSTR_COUNTER + static uint32_t timer_init[NUM_CORES] __attribute__((section(".l1"))); static uint32_t timer_end[NUM_CORES] __attribute__((section(".l1"))); +#ifdef ENABLE_INSTR_COUNTER static uint32_t instr_init[NUM_CORES] __attribute__((section(".l1"))); static uint32_t instr_end[NUM_CORES] __attribute__((section(".l1"))); +#endif static uint32_t running[NUM_CORES] __attribute__((section(".l1"))); @@ -17,11 +22,13 @@ void ResetTimer() { snrt_reset_perf_counter(SNRT_PERF_CNT0); uint32_t const core_id = snrt_global_core_idx(); uint32_t _timer_init = read_csr(mcycle); - uint32_t _instr_init = read_csr(minstret); timer_init[core_id] = _timer_init; - instr_init[core_id] = _instr_init; timer_end[core_id] = _timer_init; +#ifdef ENABLE_INSTR_COUNTER + uint32_t _instr_init = read_csr(minstret); + instr_init[core_id] = _instr_init; instr_end[core_id] = _instr_init; +#endif running[core_id] = 0; } @@ -31,7 +38,9 @@ void StartTimer() { } uint32_t const core_id = snrt_global_core_idx(); timer_init[core_id] = read_csr(mcycle); +#ifdef ENABLE_INSTR_COUNTER instr_init[core_id] = read_csr(minstret); +#endif running[core_id] = 1; } @@ -41,7 +50,9 @@ void StopTimer() { } uint32_t const core_id = snrt_global_core_idx(); timer_end[core_id] = read_csr(mcycle); +#ifdef ENABLE_INSTR_COUNTER instr_end[core_id] = read_csr(minstret); +#endif running[core_id] = 0; } @@ -55,6 +66,7 @@ uint32_t getCycles() { } uint32_t getInstr(void) { +#ifdef ENABLE_INSTR_COUNTER uint32_t const core_id = snrt_global_core_idx(); if (running[core_id]) { @@ -62,4 +74,7 @@ uint32_t getInstr(void) { } else { return instr_end[core_id] - instr_init[core_id]; } +#else + return 0; // Instruction counting disabled +#endif } \ No newline at end of file diff --git a/TargetLibraries/Snitch/src/Div_fp32.c b/TargetLibraries/Snitch/src/Div_fp32.c new file mode 100644 index 0000000000..07c3d3c5d4 --- /dev/null +++ b/TargetLibraries/Snitch/src/Div_fp32.c @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Division (FP32) + * + * Computes: output[i] = input1[i] / input2[i] + * + * Supports ONNX broadcasting rules: + * - If input2 is scalar (size=1): divides all elements of input1 by input2[0] + * - If both have same size: element-wise division + * + * input1: Numerator tensor (float32) + * input2: Denominator tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise across input1 + */ +void Div_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize across elements + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + 
start_elem = core_id * elements_per_core + remainder; + } + + // Check if input2 is a scalar (size=1, broadcasted) + // Note: This assumes the parser has set input2_size correctly + // For now, we assume element-wise division (same size) + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] / input2[i]; + } +} + +/* + * Element-wise Division with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] / scalar + * + * input1: Numerator tensor (float32) + * scalar: Scalar denominator (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + float32_t inv_scalar = 1.0f / scalar; // Compute inverse once + + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] * inv_scalar; + } +} diff --git a/TargetLibraries/Snitch/src/Gemm_fp32.c b/TargetLibraries/Snitch/src/Gemm_fp32.c index 9a79538e12..8dac98ef67 100644 --- a/TargetLibraries/Snitch/src/Gemm_fp32.c +++ b/TargetLibraries/Snitch/src/Gemm_fp32.c @@ -11,231 +11,50 @@ void gemm_fp32_transB_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A, uint32_t ldA, float32_t *B, uint32_t ldB, float32_t *C, uint32_t ldC, float32_t *Y, uint32_t BETA, uint32_t setup_SSR) { + (void)setup_SSR; uint32_t compute_id = snrt_global_compute_core_idx(); uint32_t A_offset = K * compute_id; uint32_t C_offset = N * compute_id; - // Unrolling factor of most inner loop. 
- // Should be at least as high as the FMA delay - // for maximum utilization - const uint32_t unroll = 8; - - // SSR strides and bounds only have to be configured - // once in the beginning - if (setup_SSR) { - // First matrix is not stored in transposed format - const uint32_t ssr0_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr0_i[4] = {0, sizeof(float32_t), 0, - sizeof(float32_t) * ldA}; - - // Second matrix is stored in transposed format - const uint32_t ssr1_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr1_i[4] = {sizeof(float32_t) * K, sizeof(float32_t), - sizeof(float32_t) * K * unroll, 0}; - - snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], - ssr0_i[2], ssr0_i[3]); - - snrt_ssr_repeat(SNRT_SSR_DM0, unroll); - snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], - ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); - } - - // SSR start address need to be configured each time - - snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, &A[A_offset]); - snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, B); - snrt_ssr_enable(); - - // check dimensions and values of a and b - - // Kernel progresses by 1 values each step - // const uint32_t n_frep = K - 1; for (uint32_t m = 0; m < M; m++) { - uint32_t n = 0; - for (uint32_t n0 = 0; n0 < N / unroll; n0++) { - float c[unroll]; - - // Load intermediate result - if (BETA) { - c[0] = C[C_offset + m * ldC + n + 0]; - c[1] = C[C_offset + m * ldC + n + 1]; - c[2] = C[C_offset + m * ldC + n + 2]; - c[3] = C[C_offset + m * ldC + n + 3]; - c[4] = C[C_offset + m * ldC + n + 4]; - c[5] = C[C_offset + m * ldC + n + 5]; - c[6] = C[C_offset + m * ldC + n + 6]; - c[7] = C[C_offset + m * ldC + n + 7]; - } else { - c[0] = 0.0; - c[1] = 0.0; - c[2] = 0.0; - c[3] = 0.0; - c[4] = 0.0; - c[5] = 0.0; - c[6] = 0.0; - c[7] = 0.0; - } - - asm volatile( - "frep.o %[n_frep], 8, 0, 0 \n" - "fmadd.s %[c0], ft0, ft1, %[c0] \n" - "fmadd.s %[c1], ft0, ft1, %[c1] \n" - "fmadd.s %[c2], ft0, ft1, %[c2] \n" - "fmadd.s %[c3], ft0, ft1, %[c3] \n" - "fmadd.s %[c4], ft0, ft1, %[c4] \n" - "fmadd.s %[c5], ft0, ft1, %[c5] \n" - "fmadd.s %[c6], ft0, ft1, %[c6] \n" - "fmadd.s %[c7], ft0, ft1, %[c7] \n" - : [c0] "+f"(c[0]), [c1] "+f"(c[1]), [c2] "+f"(c[2]), [c3] "+f"(c[3]), - [c4] "+f"(c[4]), [c5] "+f"(c[5]), [c6] "+f"(c[6]), [c7] "+f"(c[7]) - : [n_frep] "r"(K - 1) - : "ft0", "ft1", "ft2"); - - // Store results back - Y[C_offset + m * ldC + n + 0] = c[0]; - Y[C_offset + m * ldC + n + 1] = c[1]; - Y[C_offset + m * ldC + n + 2] = c[2]; - Y[C_offset + m * ldC + n + 3] = c[3]; - Y[C_offset + m * ldC + n + 4] = c[4]; - Y[C_offset + m * ldC + n + 5] = c[5]; - Y[C_offset + m * ldC + n + 6] = c[6]; - Y[C_offset + m * ldC + n + 7] = c[7]; - n += unroll; - } - - // Clean up of leftover columns - snrt_ssr_disable(); - for (; n < N; n++) { + for (uint32_t n = 0; n < N; n++) { float32_t c; if (BETA) { c = C[C_offset + m * ldC + n]; } else { - c = 0.0; + c = 0.0f; } for (uint32_t k = 0; k < K; k++) { - c += A[A_offset + k + m * ldA] * B[k + n * ldB]; + c += A[A_offset + m * ldA + k] * B[n * ldB + k]; } Y[C_offset + m * ldC + n] = c; } - snrt_ssr_enable(); } - snrt_ssr_disable(); } void gemm_fp32_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A, uint32_t ldA, float32_t *B, uint32_t ldB, float32_t *C, uint32_t ldC, float32_t *Y, uint32_t BETA, uint32_t setup_SSR) { + (void)setup_SSR; + uint32_t compute_id = snrt_global_compute_core_idx(); uint32_t A_offset = K * compute_id; uint32_t C_offset = N * compute_id; - // Unrolling factor of most inner loop. 
- // Should be at least as high as the FMA delay - // for maximum utilization - const uint32_t unroll = 8; - - // SSR strides and bounds only have to be configured - // once in the beginning - if (setup_SSR) { - // First matrix is not stored in transposed format - const uint32_t ssr0_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr0_i[4] = {0, sizeof(float32_t), 0, - sizeof(float32_t) * ldA}; - - // Second matrix is not stored in transposed format - const uint32_t ssr1_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr1_i[4] = {sizeof(float32_t), sizeof(float32_t) * ldB, - sizeof(float32_t) * unroll, 0}; - - snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], - ssr0_i[2], ssr0_i[3]); - - snrt_ssr_repeat(SNRT_SSR_DM0, unroll); - snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], - ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); - } - - // SSR start address need to be configured each time - - snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, &A[A_offset]); - snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, B); - snrt_ssr_enable(); - - // check dimensions and values of a and b - - // Kernel progresses by 1 values each step - // const uint32_t n_frep = K - 1; for (uint32_t m = 0; m < M; m++) { - uint32_t n = 0; - for (uint32_t n0 = 0; n0 < N / unroll; n0++) { - float c[unroll]; - - // Load intermediate result - if (BETA) { - c[0] = C[C_offset + m * ldC + n + 0]; - c[1] = C[C_offset + m * ldC + n + 1]; - c[2] = C[C_offset + m * ldC + n + 2]; - c[3] = C[C_offset + m * ldC + n + 3]; - c[4] = C[C_offset + m * ldC + n + 4]; - c[5] = C[C_offset + m * ldC + n + 5]; - c[6] = C[C_offset + m * ldC + n + 6]; - c[7] = C[C_offset + m * ldC + n + 7]; - } else { - c[0] = 0.0; - c[1] = 0.0; - c[2] = 0.0; - c[3] = 0.0; - c[4] = 0.0; - c[5] = 0.0; - c[6] = 0.0; - c[7] = 0.0; - } - - asm volatile( - "frep.o %[n_frep], 8, 0, 0 \n" - "fmadd.s %[c0], ft0, ft1, %[c0] \n" - "fmadd.s %[c1], ft0, ft1, %[c1] \n" - "fmadd.s %[c2], ft0, ft1, %[c2] \n" - "fmadd.s %[c3], ft0, ft1, %[c3] \n" - "fmadd.s %[c4], ft0, ft1, %[c4] \n" - "fmadd.s %[c5], ft0, ft1, %[c5] \n" - "fmadd.s %[c6], ft0, ft1, %[c6] \n" - "fmadd.s %[c7], ft0, ft1, %[c7] \n" - : [c0] "+f"(c[0]), [c1] "+f"(c[1]), [c2] "+f"(c[2]), [c3] "+f"(c[3]), - [c4] "+f"(c[4]), [c5] "+f"(c[5]), [c6] "+f"(c[6]), [c7] "+f"(c[7]) - : [n_frep] "r"(K - 1) - : "ft0", "ft1", "ft2"); - - // Store results back - Y[C_offset + m * ldC + n + 0] = c[0]; - Y[C_offset + m * ldC + n + 1] = c[1]; - Y[C_offset + m * ldC + n + 2] = c[2]; - Y[C_offset + m * ldC + n + 3] = c[3]; - Y[C_offset + m * ldC + n + 4] = c[4]; - Y[C_offset + m * ldC + n + 5] = c[5]; - Y[C_offset + m * ldC + n + 6] = c[6]; - Y[C_offset + m * ldC + n + 7] = c[7]; - n += unroll; - } - - // Clean up of leftover columns - snrt_ssr_disable(); - for (; n < N; n++) { + for (uint32_t n = 0; n < N; n++) { float32_t c; if (BETA) { c = C[C_offset + m * ldC + n]; } else { - c = 0.0; + c = 0.0f; } for (uint32_t k = 0; k < K; k++) { - c += A[A_offset + k + m * ldA] * B[k * ldB + n]; + c += A[A_offset + m * ldA + k] * B[k * ldB + n]; } Y[C_offset + m * ldC + n] = c; } - snrt_ssr_enable(); } - snrt_ssr_disable(); } diff --git a/TargetLibraries/Snitch/src/HardSwish.c b/TargetLibraries/Snitch/src/HardSwish.c new file mode 100644 index 0000000000..b7e9679c64 --- /dev/null +++ b/TargetLibraries/Snitch/src/HardSwish.c @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +void 
HardSwish_fp32(float32_t *data_in, float32_t *data_out, uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize by dividing work across cores + uint32_t chunk_size = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, end; + if (core_id < remainder) { + chunk_size += 1; + start = core_id * chunk_size; + } else { + start = core_id * chunk_size + remainder; + } + end = start + chunk_size; + + // HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + // Piecewise: + // x <= -3: output = 0 + // -3 < x < 3: output = x * (x/6 + 0.5) + // x >= 3: output = x + + for (uint32_t i = start; i < end; i++) { + float32_t x = data_in[i]; + float32_t clip_val = x / 6.0f + 0.5f; + + // Clamp to [0, 1] + if (clip_val < 0.0f) { + clip_val = 0.0f; + } else if (clip_val > 1.0f) { + clip_val = 1.0f; + } + + data_out[i] = x * clip_val; + } +} diff --git a/TargetLibraries/Snitch/src/Mul_fp32.c b/TargetLibraries/Snitch/src/Mul_fp32.c new file mode 100644 index 0000000000..80d6bc9b33 --- /dev/null +++ b/TargetLibraries/Snitch/src/Mul_fp32.c @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Multiplication (FP32) + * + * Computes: output[i] = input1[i] * input2[i] + * + * Supports ONNX broadcasting rules: + * - If input2 is scalar (size=1): multiplies all elements of input1 by + * input2[0] + * - If both have same size: element-wise multiplication + * + * input1: First input tensor (float32) + * input2: Second input tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise across input1 + */ +void Mul_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize across elements + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + // Element-wise multiplication + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] * input2[i]; + } +} + +/* + * Element-wise Multiplication with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] * scalar + * + * input1: Input tensor (float32) + * scalar: Scalar multiplier (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] 
= input1[i] * scalar;
+  }
+}
diff --git a/TargetLibraries/Snitch/src/RMSNrom_fp32.c b/TargetLibraries/Snitch/src/RMSNrom_fp32.c
new file mode 100644
index 0000000000..9c615ce923
--- /dev/null
+++ b/TargetLibraries/Snitch/src/RMSNrom_fp32.c
@@ -0,0 +1,50 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeploySnitchMath.h"
+#include <math.h>
+
+void RMSNorm_fp32(float32_t *data_in, float32_t *weight, float32_t *data_out,
+                  uint32_t size, uint32_t lastDimLength, float32_t eps) {
+
+  uint32_t core_id = snrt_global_compute_core_idx();
+  uint32_t numThreads = snrt_global_compute_core_num();
+
+  uint32_t num_vectors = size / lastDimLength;
+
+  // Parallelize across vectors (batch * sequence dimension)
+  uint32_t vectors_per_core = num_vectors / numThreads;
+  uint32_t remainder = num_vectors % numThreads;
+
+  uint32_t start_vec, num_vecs;
+  if (core_id < remainder) {
+    num_vecs = vectors_per_core + 1;
+    start_vec = core_id * num_vecs;
+  } else {
+    num_vecs = vectors_per_core;
+    start_vec = core_id * vectors_per_core + remainder;
+  }
+
+  for (uint32_t v = start_vec; v < start_vec + num_vecs; v++) {
+    float32_t *in_ptr = data_in + v * lastDimLength;
+    float32_t *out_ptr = data_out + v * lastDimLength;
+
+    // Compute sum of squares
+    float32_t sum_sq = 0.0f;
+    for (uint32_t i = 0; i < lastDimLength; i++) {
+      sum_sq += in_ptr[i] * in_ptr[i];
+    }
+
+    // Compute RMS with epsilon
+    float32_t rms = sqrtf(sum_sq / (float32_t)lastDimLength + eps);
+    float32_t inv_rms = 1.0f / rms;
+
+    // Apply normalization and weight
+    for (uint32_t i = 0; i < lastDimLength; i++) {
+      out_ptr[i] = in_ptr[i] * inv_rms * weight[i];
+    }
+  }
+}

From 7c8f2d81d792adbbc72f9fc167e36d176dc8e53b Mon Sep 17 00:00:00 2001
From: lee2716
Date: Sat, 31 Jan 2026 12:11:34 +0100
Subject: [PATCH 2/2] Add tiling support for MicroLlama on Snitch

Add SnitchTiledPlatform with TileConstraints for FP32 operators:
- FloatDivTileConstraint: Division tiling with scalar broadcast
- FloatMulTileConstraint: Multiplication tiling with scalar broadcast
- ReshapeTileConstraint: Pass-through tiling for reshape

Updates:
- SnitchClusterTiling with tiled code transformation passes
- Tiler.py with new tile constraints registration
- platformMapping.py adds Snitch_tiled platform
- testRunner_tiled_snitch.py for tiled model testing
- CI workflows for both untiled and tiled Snitch
---
 .../workflows/ci-platform-snitch-tiled.yml    |   5 +
 .github/workflows/ci-platform-snitch.yml      |   5 +
 CMakeLists.txt                                |   8 +-
 .../SnitchClusterTiling.py                    |  24 ++-
 .../TileConstraints/FloatDivTileConstraint.py | 112 ++++++++++++++
 .../TileConstraints/FloatMulTileConstraint.py | 112 ++++++++++++++
 .../TileConstraints/ReshapeTileConstraint.py  | 143 ++++++++++++++++++
 .../Snitch/TileConstraints/__init__.py        |   3 +
 Deeploy/Targets/Snitch/Tiler.py               |  42 ++++-
 DeeployTest/testRunner_tiled_snitch.py        |   5 +-
 DeeployTest/testUtils/platformMapping.py      |   9 +-
 11 files changed, 455 insertions(+), 13 deletions(-)
 create mode 100644 Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py
 create mode 100644 Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py
 create mode 100644 Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py

diff --git a/.github/workflows/ci-platform-snitch-tiled.yml b/.github/workflows/ci-platform-snitch-tiled.yml
index 3850ce2bde..4ebb9aba9a 100644
--- a/.github/workflows/ci-platform-snitch-tiled.yml
+++ b/.github/workflows/ci-platform-snitch-tiled.yml
@@ -41,6 +41,11 @@ jobs:
{"name":"Kernels/Integer/Softmax/Large","L1":[5000,10000]}, {"name":"Kernels/FP32/Softmax/Regular","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/RMSNorm_fused","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/MatMul","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/Add/Regular","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/Hardswish","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/Div","L1":[2000,5000,10000]}, {"name":"Kernels/FP32/GEMM/Regular","L1":[2000,5000,10000]}, {"name":"Kernels/FP32/GEMM/TransB","L1":[2000,5000,10000]}, diff --git a/.github/workflows/ci-platform-snitch.yml b/.github/workflows/ci-platform-snitch.yml index 21f436b2a6..f3a1f8722f 100644 --- a/.github/workflows/ci-platform-snitch.yml +++ b/.github/workflows/ci-platform-snitch.yml @@ -37,6 +37,11 @@ jobs: docker-image: ${{ needs.select-env.outputs.image }} test-names: | Kernels/FP32/Softmax/Regular + Kernels/FP32/RMSNorm_fused + Kernels/FP32/MatMul + Kernels/FP32/Add/Regular + Kernels/FP32/Hardswish + Kernels/FP32/Div Kernels/Integer/Add/Large Kernels/Integer/Add/Regular diff --git a/CMakeLists.txt b/CMakeLists.txt index 70dec13084..e675a648cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,8 +19,8 @@ if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch, Snitch_tiled)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch Snitch_tiled) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -36,6 +36,8 @@ elseif(platform STREQUAL Generic) message(STATUS "Building for platform 'Generic'") elseif(platform STREQUAL Snitch) message(STATUS "Building for platform 'Snitch'") +elseif(platform STREQUAL Snitch_tiled) + message(STATUS "Building for platform 'Snitch_tiled'") elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) @@ -211,7 +213,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor endif() -if(platform STREQUAL Snitch) +if(platform STREQUAL Snitch OR platform STREQUAL Snitch_tiled) if(TOOLCHAIN STREQUAL LLVM) set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/snitch/toolchain_llvm.cmake) diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py index e8204f6ae2..a3e10ed188 100644 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py +++ b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py @@ -23,15 +23,31 @@ class SnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration): class ProfilingSnitchClusterTilingSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn): _printCycleDifference = NodeTemplate(r""" - printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ - ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + printf("%s%u][Core %d] %s%6u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurement}, ${suffixStr}); + 
""") + + _printCycleContribution = NodeTemplate(r""" + uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput}; + uint32_t dma = ${measurementInput} + ${measurementOutput}; + float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; + printf("%s%u][Core %d] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\n", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), total, kernel_percentage, overhead_percentage, ${measurementKernel}, dma); """) class ProfilingSnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn): _printCycleDifference = NodeTemplate(r""" - printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ - ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + printf("%s%u][Core %d] %s%6u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurement}, ${suffixStr}); + """) + + _printCycleContribution = NodeTemplate(r""" + uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput}; + uint32_t dma = ${measurementInput} + ${measurementOutput}; + float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; + printf("%s%u][Core %d] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\n", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), total, kernel_percentage, overhead_percentage, ${measurementKernel}, dma); """) diff --git a/Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py new file mode 100644 index 0000000000..b9b07be30a --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py @@ -0,0 +1,112 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class FloatDivTileConstraint(TileConstraint): + """Tile constraint for FP32 Div operation supporting scalar broadcasting.""" + + dataIn1Name = "input1" + dataIn2Name = "input2" + dataOutName = "output" + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict[cls.dataIn1Name] + inputBuffer2Name = parseDict[cls.dataIn2Name] + outputBufferName = parseDict[cls.dataOutName] + + input1Shape = ctxt.lookup(inputBuffer1Name).shape + input2Shape = ctxt.lookup(inputBuffer2Name).shape + + # Add tensor dimensions to model + tilerModel.addTensorDimToModel(ctxt, inputBuffer1Name) + tilerModel.addTensorDimToModel(ctxt, outputBufferName) + + # Check if input2 is scalar (total size == 1) + is_scalar = np.prod(input2Shape) == 1 + + if is_scalar: + # Scalar broadcasting: input2 is a 
scalar, don't tile it + # Only add input2 dimensions if it has more than 0 dims + if len(input2Shape) > 0: + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + # Constrain scalar to remain untiled (size 1) + for dim in range(len(input2Shape)): + input2DimVar = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + tilerModel.addConstraint(input2DimVar == input2Shape[dim]) + + # Input1 and output must have same dimensions + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + else: + # Element-wise: both inputs must have same shape + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + + tilerModel.addConstraint(inputDim1Var == inputDim2Var) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + # Check if scalar broadcasting + input2Name = operatorRepresentation[cls.dataIn2Name] + input2Shape = ctxt.lookup(input2Name).shape + is_scalar = np.prod(input2Shape) == 1 + + for cube in outputCubes: + newSize = np.prod(cube.dims) + replacements["size"].append(newSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + if is_scalar: + # For scalar, load the entire scalar tensor (size 1) + scalarCube = HyperRectangle(tuple([0] * len(input2Shape)), tuple(input2Shape)) + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: scalarCube}) + else: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) + + for out in outputCubes: + outputLoadSchedule.append({cls.dataOutName: out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py new file mode 100644 index 0000000000..99df639004 --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py @@ -0,0 +1,112 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import 
NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class FloatMulTileConstraint(TileConstraint): + """Tile constraint for FP32 Mul operation supporting scalar broadcasting.""" + + dataIn1Name = "A" + dataIn2Name = "B" + dataOutName = "C" + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict[cls.dataIn1Name] + inputBuffer2Name = parseDict[cls.dataIn2Name] + outputBufferName = parseDict[cls.dataOutName] + + input1Shape = ctxt.lookup(inputBuffer1Name).shape + input2Shape = ctxt.lookup(inputBuffer2Name).shape + + # Add tensor dimensions to model + tilerModel.addTensorDimToModel(ctxt, inputBuffer1Name) + tilerModel.addTensorDimToModel(ctxt, outputBufferName) + + # Check if input2 is scalar (total size == 1) + is_scalar = np.prod(input2Shape) == 1 + + if is_scalar: + # Scalar broadcasting: input2 is a scalar, don't tile it + # Only add input2 dimensions if it has more than 0 dims + if len(input2Shape) > 0: + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + # Constrain scalar to remain untiled (size 1) + for dim in range(len(input2Shape)): + input2DimVar = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + tilerModel.addConstraint(input2DimVar == input2Shape[dim]) + + # Input1 and output must have same dimensions + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + else: + # Element-wise: both inputs must have same shape + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + + tilerModel.addConstraint(inputDim1Var == inputDim2Var) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + # Check if scalar broadcasting + input2Name = operatorRepresentation[cls.dataIn2Name] + input2Shape = ctxt.lookup(input2Name).shape + is_scalar = np.prod(input2Shape) == 1 + + for cube in outputCubes: + newSize = np.prod(cube.dims) + replacements["size"].append(newSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + if is_scalar: + # For 
scalar, load the entire scalar tensor (size 1) + scalarCube = HyperRectangle(tuple([0] * len(input2Shape)), tuple(input2Shape)) + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: scalarCube}) + else: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) + + for out in outputCubes: + outputLoadSchedule.append({cls.dataOutName: out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py new file mode 100644 index 0000000000..1bafa36e3b --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py @@ -0,0 +1,143 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class ReshapeTileConstraint(TileConstraint): + """Tile constraint for Reshape operation - a NOP that just reinterprets data layout.""" + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + pointer: List[str] = [] + + for key, value in parseDict.items(): + if not isinstance(value, str): + continue + + if ctxt.is_global(value) or ctxt.is_local(value): + pointer.append(value) + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + _buffer = ctxt.lookup(bufferName) + tilerModel.addTensorDimToModel(ctxt, bufferName) + + for idx, shapeDim in enumerate(_buffer.shape): + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = bufferName, dimIdx = idx) <= shapeDim) + + # Constrain total elements to be equal + inputBuffer = ctxt.lookup(inputBufferName) + outputBuffer = ctxt.lookup(outputBufferName) + + # For reshape, we want the tiles to have the same total number of elements + # This is automatically satisfied if we tile based on output and compute input from that + + # Remove unused tensors from deployment + for bufferName in pointer: + if bufferName not in [inputBufferName, outputBufferName]: + ctxt.lookup(bufferName)._deploy = False + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # For 
reshape, input and output have the same data, just different interpretations + # We need to compute the corresponding input cube for each output cube + inputName = operatorRepresentation['data_in'] + outputName = operatorRepresentation['data_out'] + inputShape = ctxt.lookup(inputName).shape + outputShape = ctxt.lookup(outputName).shape + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + # Calculate the flat offset and size for the output cube + outSize = np.prod(cube.dims) + replacements["size"].append(outSize) + + # For reshape, we need to map output cube to input cube + # Calculate flat index range for output cube + outOffset = 0 + outStrides = [] + stride = 1 + for dim in reversed(outputShape): + outStrides.insert(0, stride) + stride *= dim + + for i, (off, dim) in enumerate(zip(cube.offset, cube.dims)): + outOffset += off * outStrides[i] + + # Convert flat offset to input coordinates + inStrides = [] + stride = 1 + for dim in reversed(inputShape): + inStrides.insert(0, stride) + stride *= dim + + inOffset = [] + remaining = outOffset + for i, stride in enumerate(inStrides): + inOffset.append(remaining // stride) + remaining = remaining % stride + + # Calculate input cube dimensions + # For simplicity, treat as 1D cube in input space + inCubeDims = list(inputShape) + inCubeOffset = [0] * len(inputShape) + + # Set the last dimension to the size, and offset based on flat index + totalSize = outSize + if len(inputShape) > 0: + # Compute proper input cube that covers the same elements + # Use a simple approach: linearize the input + inCubeOffset = list(inOffset) + inCubeDims = [1] * len(inputShape) + inCubeDims[-1] = min(totalSize, inputShape[-1] - inCubeOffset[-1]) + remaining = totalSize - inCubeDims[-1] + + for i in range(len(inputShape) - 2, -1, -1): + if remaining <= 0: + break + inCubeDims[i] = min(remaining // np.prod(inputShape[i + 1:]) + 1, inputShape[i]) + remaining -= (inCubeDims[i] - 1) * np.prod(inputShape[i + 1:]) + + inputCube = HyperRectangle(tuple(inCubeOffset), tuple(inCubeDims)) + inputLoadSchedule.append({"data_in": inputCube}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/__init__.py b/Deeploy/Targets/Snitch/TileConstraints/__init__.py index 947a6fd82a..aece19d881 100644 --- a/Deeploy/Targets/Snitch/TileConstraints/__init__.py +++ b/Deeploy/Targets/Snitch/TileConstraints/__init__.py @@ -3,5 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from . 
import * +from .FloatDivTileConstraint import * +from .FloatMulTileConstraint import * from .iNoNormTileConstraint import * from .iSoftmaxTileConstraint import * +from .ReshapeTileConstraint import * diff --git a/Deeploy/Targets/Snitch/Tiler.py b/Deeploy/Targets/Snitch/Tiler.py index 475a425779..5a5f4d0bf4 100644 --- a/Deeploy/Targets/Snitch/Tiler.py +++ b/Deeploy/Targets/Snitch/Tiler.py @@ -3,10 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint -from Deeploy.Targets.Snitch.Bindings import SnitchAddBindings, SnitchGemmBindings, SnitchiNoNormBindings, \ - SnitchiSoftmaxBindings, SnitchRQAddBindings, SnitchRqGemmBindings +from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iRMSNormTileConstraint import iRMSNormTileConstraint +from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint +from Deeploy.Targets.Snitch.Bindings import SnitchAddBindings, SnitchConcatBindings, SnitchDivBindings, \ + SnitchGatherBindings, SnitchGemmBindings, SnitchHardSwishBindings, SnitchiNoNormBindings, SnitchiSoftmaxBindings, \ + SnitchMatMulBindings, SnitchMulBindings, SnitchReshapeBindings, SnitchRMSNormBindings, SnitchRQAddBindings, \ + SnitchRqGemmBindings, SnitchTransposeBindings from Deeploy.Targets.Snitch.TileConstraints import iNoNormTileConstraint, iSoftmaxTileConstraint +from Deeploy.Targets.Snitch.TileConstraints.FloatDivTileConstraint import FloatDivTileConstraint +from Deeploy.Targets.Snitch.TileConstraints.FloatMulTileConstraint import FloatMulTileConstraint from Deeploy.Targets.Snitch.TileConstraints.GemmTileConstraint import GemmTileConstraint +from Deeploy.Targets.Snitch.TileConstraints.ReshapeTileConstraint import ReshapeTileConstraint from Deeploy.Targets.Snitch.TileConstraints.RqGemmTileConstraint import RqGemmTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings @@ -23,3 +34,30 @@ SnitchAddTileReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchAddBindings, tileConstraint = AddTileConstraint()) + +SnitchRMSNormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchRMSNormBindings, + tileConstraint = iRMSNormTileConstraint()) + +SnitchHardSwishTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchHardSwishBindings, + tileConstraint = iHardswishTileConstraint()) + +SnitchDivTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchDivBindings, + tileConstraint = FloatDivTileConstraint()) + +SnitchMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchMulBindings, + tileConstraint = FloatMulTileConstraint()) + +SnitchMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchMatMulBindings, + tileConstraint = MatMulTileConstraint()) + +SnitchConcatTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchConcatBindings, + tileConstraint = ConcatTileConstraint()) + +SnitchTransposeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchTransposeBindings, + tileConstraint = TransposeTileConstraint()) + +SnitchReshapeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings 
= SnitchReshapeBindings, + tileConstraint = ReshapeTileConstraint()) + +SnitchGatherTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchGatherBindings, + tileConstraint = GatherTileConstraint()) diff --git a/DeeployTest/testRunner_tiled_snitch.py b/DeeployTest/testRunner_tiled_snitch.py index 7787d1f844..cf6ac6b2e0 100644 --- a/DeeployTest/testRunner_tiled_snitch.py +++ b/DeeployTest/testRunner_tiled_snitch.py @@ -25,7 +25,10 @@ args = parser.parse_args() - testRunner = TestRunner(platform = "Snitch", simulator = args.simulator, tiling = True, argument_parser = parser) + testRunner = TestRunner(platform = "Snitch_tiled", + simulator = args.simulator, + tiling = True, + argument_parser = parser) testRunner.cmake_args += f" -D NUM_CORES={args.cores}" diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 48c5777905..9d562cf577 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -24,12 +24,12 @@ from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform from Deeploy.Targets.Snitch.Deployer import SnitchDeployer -from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform +from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform, SnitchTiledPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Snitch_tiled", "Chimera"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -65,6 +65,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Snitch": Platform = SnitchPlatform() + elif platformName == "Snitch_tiled": + Platform = SnitchTiledPlatform() + elif platformName == "SoftHier": Platform = SoftHierPlatform() @@ -217,7 +220,7 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - elif isinstance(platform, (SnitchPlatform)): + elif isinstance(platform, (SnitchPlatform, SnitchTiledPlatform)): if loweringOptimizer is None: loweringOptimizer = SnitchOptimizer
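
Notes: the sketches below are small host-side Python references for the kernels and tile constraints added in these two patches. The helper names (rmsnorm_ref, hardswish_ref, core_range, broadcast_strides) are illustrative only and do not exist in the tree.

The fused RMSNorm kernel and the RMSNorm_fused test vectors follow the usual definition out = x / sqrt(mean(x^2) + eps) * w over the last dimension. A minimal NumPy sketch that can be used to cross-check RMSNorm_fp32:

import numpy as np

def rmsnorm_ref(x: np.ndarray, weight: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # Normalize each vector along the last dimension by sqrt(mean(x^2) + eps),
    # then scale by the per-channel weight, mirroring RMSNorm_fp32.
    x = x.astype(np.float32)
    rms = np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + np.float32(eps))
    return (x / rms) * weight.astype(np.float32)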
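
HardSwish_fp32 implements x * clip(x/6 + 0.5, 0, 1); a one-line NumPy reference for regenerating or sanity-checking the Hardswish kernel test outputs:

import numpy as np

def hardswish_ref(x: np.ndarray) -> np.ndarray:
    # HardSwish(x) = x * clip(x/6 + 0.5, 0, 1), equivalent to the piecewise
    # form used in HardSwish_fp32.
    x = x.astype(np.float32)
    return x * np.clip(x / np.float32(6.0) + np.float32(0.5), 0.0, 1.0)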
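
All of the element-wise kernels (Add_fp32, Div_fp32, Mul_fp32, HardSwish_fp32) and RMSNorm_fp32 distribute work across compute cores with the same chunk-plus-remainder split. A sketch of that split, with a check that the per-core ranges cover the index space exactly once:

def core_range(core_id: int, num_cores: int, size: int):
    # Mirrors the per-core partition used by the FP32 Snitch kernels:
    # the first (size % num_cores) cores take one extra element.
    chunk, rem = divmod(size, num_cores)
    if core_id < rem:
        n = chunk + 1
        start = core_id * n
    else:
        n = chunk
        start = core_id * chunk + rem
    return start, start + n

# The ranges tile [0, size) with no gaps and no overlap.
for size in (1, 7, 64, 1000):
    covered = [i for c in range(9) for i in range(*core_range(c, 9, size))]
    assert covered == list(range(size))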
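
Add_fp32_broadcast expects per-dimension element strides in which broadcast (size-1 or missing) dimensions carry stride 0, so the same element is reused along those axes. A sketch of how such strides can be derived on the host side, together with a check that the kernel's index arithmetic reproduces NumPy broadcasting (the stride-generating side is not part of the excerpt above, so this is an assumption about intended use):

import numpy as np

def broadcast_strides(in_shape, out_shape):
    # Right-align the input shape with the output shape; broadcast
    # dimensions get stride 0, all others their element stride.
    shape = [1] * (len(out_shape) - len(in_shape)) + list(in_shape)
    strides = [0] * len(shape)
    acc = 1
    for d in range(len(shape) - 1, -1, -1):
        strides[d] = 0 if shape[d] == 1 else acc
        acc *= shape[d]
    return strides

a = np.arange(4, dtype=np.float32)                    # shape (4,)
b = np.arange(24, dtype=np.float32).reshape(2, 3, 4)  # shape (2, 3, 4)
s1 = broadcast_strides(a.shape, b.shape)
s2 = broadcast_strides(b.shape, b.shape)

out = np.empty(b.size, dtype=np.float32)
for i in range(b.size):  # same index reconstruction as Add_fp32_broadcast
    tmp, idx1, idx2 = i, 0, 0
    for d in range(b.ndim - 1, -1, -1):
        coord = tmp % b.shape[d]
        tmp //= b.shape[d]
        idx1 += coord * s1[d]
        idx2 += coord * s2[d]
    out[i] = a.ravel()[idx1] + b.ravel()[idx2]
assert np.allclose(out.reshape(b.shape), a + b)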
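
Div_fp32_scalar folds the division into one reciprocal followed by per-element multiplications, trading a divide per element for a single divide plus a multiply. The result is not bit-identical to a true per-element division, but the difference stays within FP32 round-off, which is worth keeping in mind when choosing test tolerances. A quick numeric check with arbitrary values:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal(1024).astype(np.float32)
s = np.float32(3.7)

direct = x / s
via_reciprocal = x * (np.float32(1.0) / s)  # the Div_fp32_scalar formulation

# Not bit-exact in general, but within a couple of ulps for FP32.
assert np.allclose(direct, via_reciprocal, rtol=1e-6, atol=0.0)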
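
The Reshape tile constraint treats Reshape as a no-op on contiguous data: an output tile covering flat elements [offset, offset + size) refers to exactly the same flat elements of the input, which is the property the flat-offset arithmetic in serializeTilingSolution relies on. A small illustration with arbitrary shapes:

import numpy as np

in_shape, out_shape = (8, 8), (2, 4, 8)
x = np.arange(np.prod(in_shape), dtype=np.float32).reshape(in_shape)
y = x.reshape(out_shape)

# An output tile described by (offset, dims), as the tiler emits it.
off, dims = (1, 2, 0), (1, 1, 8)
out_strides = [int(np.prod(out_shape[d + 1:])) for d in range(len(out_shape))]
flat = sum(o * s for o, s in zip(off, out_strides))
size = int(np.prod(dims))

# The same flat range of the input holds the tile's data.
assert np.array_equal(y.ravel()[flat:flat + size], x.ravel()[flat:flat + size])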