diff --git a/.github/workflows/ci-platform-snitch-tiled.yml b/.github/workflows/ci-platform-snitch-tiled.yml index 3850ce2bde..4ebb9aba9a 100644 --- a/.github/workflows/ci-platform-snitch-tiled.yml +++ b/.github/workflows/ci-platform-snitch-tiled.yml @@ -41,6 +41,11 @@ jobs: {"name":"Kernels/Integer/Softmax/Large","L1":[5000,10000]}, {"name":"Kernels/FP32/Softmax/Regular","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/RMSNorm_fused","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/MatMul","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/Add/Regular","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/Hardswish","L1":[2000,5000,10000]}, + {"name":"Kernels/FP32/Div","L1":[2000,5000,10000]}, {"name":"Kernels/FP32/GEMM/Regular","L1":[2000,5000,10000]}, {"name":"Kernels/FP32/GEMM/TransB","L1":[2000,5000,10000]}, diff --git a/.github/workflows/ci-platform-snitch.yml b/.github/workflows/ci-platform-snitch.yml index 21f436b2a6..f3a1f8722f 100644 --- a/.github/workflows/ci-platform-snitch.yml +++ b/.github/workflows/ci-platform-snitch.yml @@ -37,6 +37,11 @@ jobs: docker-image: ${{ needs.select-env.outputs.image }} test-names: | Kernels/FP32/Softmax/Regular + Kernels/FP32/RMSNorm_fused + Kernels/FP32/MatMul + Kernels/FP32/Add/Regular + Kernels/FP32/Hardswish + Kernels/FP32/Div Kernels/Integer/Add/Large Kernels/Integer/Add/Regular diff --git a/CMakeLists.txt b/CMakeLists.txt index 70dec13084..e675a648cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,8 +19,8 @@ if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch, Snitch_tiled)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch Snitch_tiled) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -36,6 +36,8 @@ elseif(platform STREQUAL Generic) message(STATUS "Building for platform 'Generic'") elseif(platform STREQUAL Snitch) message(STATUS "Building for platform 'Snitch'") +elseif(platform STREQUAL Snitch_tiled) + message(STATUS "Building for platform 'Snitch_tiled'") elseif(platform STREQUAL SoftHier) message(STATUS "Building for platform 'SoftHier'") elseif(platform STREQUAL Chimera) @@ -211,7 +213,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor endif() -if(platform STREQUAL Snitch) +if(platform STREQUAL Snitch OR platform STREQUAL Snitch_tiled) if(TOOLCHAIN STREQUAL LLVM) set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/snitch/toolchain_llvm.cmake) diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index ec2ed6270f..221a797dab 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -283,6 +283,9 @@ BasicConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, BasicTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, BasicTransformer) ] BasicQuantBindings = [ diff --git 
a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..26dd5746c9 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -709,3 +709,31 @@ def computeOps(self): numPx = opRep['dim_im_out_x'] return numPx * opsPerPx + + +class RMSNormLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # RMSNorm: square, mean, sqrt, div, mul + size = self.mapper.parser.operatorRepresentation['size'] + lastDimLength = self.mapper.parser.operatorRepresentation['lastDimLength'] + batch_size = size // lastDimLength + + # square + sum + mean + eps + sqrt + div + mul + ops = size + batch_size * lastDimLength + batch_size * 4 + size * 2 + return ops + + +class HardSwishLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + # Operations: div + add + clip + mul + size = self.mapper.parser.operatorRepresentation['size'] + return size * 4 diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index cf1ba776bd..f0abefd4f6 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -467,23 +467,62 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) - return ret def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - data_in_1 = ctxt.lookup(node.inputs[0].name) data_in_2 = ctxt.lookup(node.inputs[1].name) data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in_1'] = data_in_1.name self.operatorRepresentation['data_in_2'] = data_in_2.name self.operatorRepresentation['data_out'] = data_out.name - self.operatorRepresentation['size'] = np.prod(data_in_1.shape) + self.operatorRepresentation['size'] = np.prod(data_out.shape) + + # Check if broadcasting is needed + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + need_broadcast = (shape1 != out_shape) or (shape2 != out_shape) + self.operatorRepresentation['need_broadcast'] = need_broadcast + + if need_broadcast: + # Calculate strides for broadcasting + ndim = len(out_shape) + + # Compute strides for input 1 + strides1 = [1] * ndim + for i in range(ndim - 1, -1, -1): + if i < len(shape1) and shape1[i] == out_shape[i]: + if i == ndim - 1: + strides1[i] = 1 + else: + strides1[i] = strides1[i + 1] * shape1[i + 1] if ( + i + 1 < len(shape1) and shape1[i + 1] == out_shape[i + 1]) else strides1[i + 1] + else: + strides1[i] = 0 # Broadcast dimension + + # Compute strides for input 2 + strides2 = [1] * ndim + for i in range(ndim - 1, -1, -1): + if i < len(shape2) and shape2[i] == out_shape[i]: + if i == ndim - 1: + strides2[i] = 1 + else: + strides2[i] = strides2[i + 1] * shape2[i + 1] if ( + i + 1 < len(shape2) and shape2[i + 1] == out_shape[i + 1]) else strides2[i + 1] + else: + strides2[i] = 0 # Broadcast dimension + + self.operatorRepresentation['ndim'] = ndim + self.operatorRepresentation['strides1'] = strides1 + self.operatorRepresentation['strides2'] = strides2 + self.operatorRepresentation['out_shape'] = out_shape return ctxt, True diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..6b3ff546b3 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ 
b/Deeploy/Targets/Generic/TypeCheckers.py @@ -102,6 +102,20 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [False] +class FloatAddChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [True] + + class GatherChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): @@ -610,3 +624,40 @@ def _inferNumLevels(self, inputs: List[VariableBuffer], def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: return [True] + + +class RMSNormChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + # RMSNorm: square, mean, sqrt, reciprocal, multiply + # Output precision similar to input + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + # RMSNorm output can be signed (depending on input signedness) + if inputs[0]._signed: + return [True] + else: + return [False] + + +class HardSwishChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + if inputs[0]._signed: + return [True] + else: + return [False] diff --git a/Deeploy/Targets/Snitch/Bindings.py b/Deeploy/Targets/Snitch/Bindings.py index 25b150b553..e442f63038 100644 --- a/Deeploy/Targets/Snitch/Bindings.py +++ b/Deeploy/Targets/Snitch/Bindings.py @@ -11,12 +11,20 @@ from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import iNoNormTemplate -from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker +from Deeploy.Targets.Generic.Templates import ConcatTemplate, iNoNormTemplate +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, DivChecker, GatherChecker, GEMMChecker, \ + HardSwishChecker, MatMulChecker, MulChecker, ReshapeChecker, RMSNormChecker, RQAddChecker, SoftmaxChecker, \ + TransposeChecker, iNoNormChecker from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \ SnitchSynchCoresPass from Deeploy.Targets.Snitch.DMA.SnitchDma import SnitchDma -from Deeploy.Targets.Snitch.Templates import AddTemplate, 
FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate +from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, FloatMatMulTemplate, GatherTemplate, \ + MatMulTemplate, ReshapeTemplate, RQAddTemplate, TransposeTemplate, iSoftmaxTemplate +from Deeploy.Targets.Snitch.Templates.FloatAddTemplate import referenceTemplate as FloatAddTemplate +from Deeploy.Targets.Snitch.Templates.FloatDivTemplate import referenceTemplate as FloatDivTemplate +from Deeploy.Targets.Snitch.Templates.FloatHardSwishTemplate import referenceTemplate as FloatHardSwishTemplate +from Deeploy.Targets.Snitch.Templates.FloatMulTemplate import referenceTemplate as FloatMulTemplate +from Deeploy.Targets.Snitch.Templates.FloatRMSNormTemplate import referenceTemplate as FloatRMSNormTemplate from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template from Deeploy.Targets.Snitch.Templates.GemmTemplate import SnitchGemm_Template from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template @@ -45,6 +53,7 @@ ArgumentStructGeneration(), MemoryManagementGeneration("L1"), MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + MemoryManagementGeneration("L2"), MemoryManagementGeneration() ]) @@ -69,7 +78,18 @@ SnitchAddBindings = [ NodeBinding(AddChecker([PointerClass(_type), PointerClass(_type)], [PointerClass(int32_t)]), AddTemplate.referenceTemplate, TiledTransformer) for _type in [int8_t] +] + [ + # fp32 support + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate, TiledTransformer) +] + +# Basic (non-tiled) FP32 Add Bindings +BasicAddBindings = [ + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate, BasicTransformer) ] + SnitchGemmBindings = [ NodeBinding( GEMMChecker([PointerClass(int8_t), PointerClass(int8_t), @@ -90,3 +110,99 @@ PointerClass(int32_t) ], [PointerClass(int8_t)]), SnitchRqGemm_Template, TiledTransformer) ] + +# RMSNorm Bindings (Tiled) +SnitchRMSNormBindings = [ + NodeBinding(RMSNormChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatRMSNormTemplate, TiledTransformer) +] + +# RMSNorm Bindings (Non-tiled) +BasicRMSNormBindings = [ + NodeBinding(RMSNormChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatRMSNormTemplate, BasicTransformer) +] + +# HardSwish Bindings (Tiled) +SnitchHardSwishBindings = [ + NodeBinding(HardSwishChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatHardSwishTemplate, + TiledTransformer) +] + +# HardSwish Bindings (Non-tiled) +BasicHardSwishBindings = [ + NodeBinding(HardSwishChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatHardSwishTemplate, + BasicTransformer) +] + +# Div Bindings (Tiled) +SnitchDivBindings = [ + NodeBinding(DivChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatDivTemplate, TiledTransformer) +] + +# Div Bindings (Non-tiled) +BasicDivBindings = [ + NodeBinding(DivChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatDivTemplate, BasicTransformer) +] + +# Mul Bindings (Tiled) +SnitchMulBindings = [ + NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMulTemplate, TiledTransformer) +] + +# Mul Bindings (Non-tiled) +BasicMulBindings = [ + NodeBinding(MulChecker([PointerClass(float32_t), 
PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMulTemplate, BasicTransformer) +] + +# MatMul Bindings (Tiled) +SnitchMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + MatMulTemplate.referenceTemplate, TiledTransformer), + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMatMulTemplate.referenceTemplate, TiledTransformer) +] + +# Concat Bindings (Tiled) +SnitchConcatBindings = [ + NodeBinding(ConcatChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int8_t)]), + ConcatTemplate.referenceTemplate, TiledTransformer), + NodeBinding(ConcatChecker([PointerClass(int32_t), PointerClass(int32_t)], [PointerClass(int32_t)]), + ConcatTemplate.referenceTemplate, TiledTransformer), + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, TiledTransformer) +] + +# Transpose Bindings (Tiled) +SnitchTransposeBindings = [ + NodeBinding(TransposeChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), TransposeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(TransposeChecker([PointerClass(int32_t)], [PointerClass(int32_t)]), TransposeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(TransposeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + TransposeTemplate.referenceTemplate, TiledTransformer) +] + +# Reshape Bindings (Tiled) +SnitchReshapeBindings = [ + NodeBinding(ReshapeChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), ReshapeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(ReshapeChecker([PointerClass(int32_t)], [PointerClass(int32_t)]), ReshapeTemplate.referenceTemplate, + TiledTransformer), + NodeBinding(ReshapeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), ReshapeTemplate.referenceTemplate, + TiledTransformer) +] + +# Gather Bindings (Tiled) +SnitchGatherBindings = [ + NodeBinding(GatherChecker([PointerClass(int8_t), PointerClass(int32_t)], [PointerClass(int8_t)]), + GatherTemplate.referenceTemplate, TiledTransformer), + NodeBinding(GatherChecker([PointerClass(int32_t), PointerClass(int32_t)], [PointerClass(int32_t)]), + GatherTemplate.referenceTemplate, TiledTransformer), + NodeBinding(GatherChecker([PointerClass(float32_t), PointerClass(int32_t)], [PointerClass(float32_t)]), + GatherTemplate.referenceTemplate, TiledTransformer) +] diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py index e8204f6ae2..a3e10ed188 100644 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py +++ b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py @@ -23,15 +23,31 @@ class SnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration): class ProfilingSnitchClusterTilingSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn): _printCycleDifference = NodeTemplate(r""" - printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ - ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + printf("%s%u][Core %d] %s%6u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurement}, ${suffixStr}); + """) + + _printCycleContribution = NodeTemplate(r""" + uint32_t total = ${measurementInput} + ${measurementKernel} + 
${measurementOutput}; + uint32_t dma = ${measurementInput} + ${measurementOutput}; + float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; + printf("%s%u][Core %d] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\n", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), total, kernel_percentage, overhead_percentage, ${measurementKernel}, dma); """) class ProfilingSnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn): _printCycleDifference = NodeTemplate(r""" - printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ - ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + printf("%s%u][Core %d] %s%6u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurement}, ${suffixStr}); + """) + + _printCycleContribution = NodeTemplate(r""" + uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput}; + uint32_t dma = ${measurementInput} + ${measurementOutput}; + float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; + printf("%s%u][Core %d] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\n", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), total, kernel_percentage, overhead_percentage, ${measurementKernel}, dma); """) diff --git a/Deeploy/Targets/Snitch/Parsers.py b/Deeploy/Targets/Snitch/Parsers.py index 0051994686..6976d8d356 100644 --- a/Deeploy/Targets/Snitch/Parsers.py +++ b/Deeploy/Targets/Snitch/Parsers.py @@ -4,10 +4,11 @@ from typing import Tuple +import numpy as np import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import NetworkContext -from Deeploy.Targets.Generic.Parsers import GEMMParser, RQGEMMParser +from Deeploy.DeeployTypes import NetworkContext, NodeParser +from Deeploy.Targets.Generic.Parsers import AddParser, DivParser, GEMMParser, MulParser, RQGEMMParser class SnitchGEMMParser(GEMMParser): @@ -72,3 +73,262 @@ def parseNodeCtxt(self, return ctxt, False return newCtxt, True + + +class SnitchRMSNormParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + if node.op != 'RMSNorm': + return False + if len(node.inputs) != 2 or len(node.outputs) != 1: + return False + eps = node.attrs.get('eps', node.attrs.get('epsilon', 1e-6)) + self.operatorRepresentation['eps'] = f"{float(eps):.10e}f" + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + data_in = ctxt.lookup(node.inputs[0].name) + weight = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['weight'] = weight.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['input_shape'] = list(data_in.shape) + self.operatorRepresentation['weight_shape'] = list(weight.shape) + self.operatorRepresentation['output_shape'] = list(data_out.shape) + self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + self.operatorRepresentation['lastDimLength'] = int(data_in.shape[-1]) + self.operatorRepresentation['input_ndim'] = len(data_in.shape) + self.operatorRepresentation['weight_ndim'] = len(weight.shape) + + 
return ctxt, True + + +class HardSwishParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + """Parse HardSwish node.""" + + if node.op != 'HardSwish': + return False + + # Check basic structure: 1 input and 1 output + if len(node.inputs) != 1 or len(node.outputs) != 1: + return False + + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """Parse HardSwish node with network context.""" + + # Get input and output buffers + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + + # Store buffer names + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + + # Calculate size for memory allocation + self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + + return ctxt, True + + +class SnitchAddParser(AddParser): + """ + Inherits from GenericAddParser and adds support for Broadcasting. + + Compatibility: + - No broadcasting: Uses the Add_fp32() fast path. + - With broadcasting: Uses the Add_fp32_broadcast() generic version. + """ + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Call parent method to retrieve basic information + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return ctxt, False + + # Retrieve shape information + data_in_1 = ctxt.lookup(node.inputs[0].name) + data_in_2 = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + # Correct 'size' to match the output shape (after broadcasting) + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + + # Broadcasting information + self.operatorRepresentation['shape1'] = shape1 + self.operatorRepresentation['shape2'] = shape2 + self.operatorRepresentation['out_shape'] = out_shape + self.operatorRepresentation['ndim'] = len(out_shape) + + # Determine if broadcasting is needed + need_broadcast = (shape1 != shape2) + self.operatorRepresentation['need_broadcast'] = need_broadcast + + if need_broadcast: + strides1, strides2 = self._compute_broadcast_strides(shape1, shape2, out_shape) + self.operatorRepresentation['strides1'] = strides1 + self.operatorRepresentation['strides2'] = strides2 + + return ctxt, True + + def _compute_broadcast_strides(self, shape1, shape2, out_shape): + """ + Calculates strides after broadcasting (following ONNX/NumPy rules). + + Principles: + - Align dimensions from right to left. + - When a dimension is 1, set stride to 0 to achieve the broadcasting effect. 
+ + Example: + shape1=[8,8,8], shape2=[8] + → pad2=[1,1,8] + → strides1=[64,8,1], strides2=[0,0,1] + """ + ndim = len(out_shape) + + # Right-align and pad to the same number of dimensions + pad1 = [1] * (ndim - len(shape1)) + shape1 + pad2 = [1] * (ndim - len(shape2)) + shape2 + + def calc_strides(padded_shape, out_shape): + strides = [] + stride = 1 + for i in range(ndim - 1, -1, -1): + if padded_shape[i] == 1 and out_shape[i] > 1: + strides.insert(0, 0) # Broadcast dimension stride=0 + else: + strides.insert(0, stride) + + # Update stride multiplier only if the current dimension is essentially used (size > 1) + stride *= padded_shape[i] if padded_shape[i] > 1 else 1 + return strides + + strides1 = calc_strides(pad1, out_shape) + strides2 = calc_strides(pad2, out_shape) + + return strides1, strides2 + + +class SnitchDivParser(DivParser): + """ + Snitch-specific Div Parser. + Inherits from Generic DivParser and adds shape/broadcasting information. + """ + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """ + Extend Generic parser to add shape and broadcasting information. + """ + # Call parent method first + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + # Get shape information + data_in_1 = ctxt.lookup(node.inputs[0].name) + data_in_2 = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + # Store shape information + self.operatorRepresentation['shape1'] = shape1 + self.operatorRepresentation['shape2'] = shape2 + self.operatorRepresentation['out_shape'] = out_shape + + # Calculate sizes + self.operatorRepresentation['size1'] = int(np.prod(shape1)) + self.operatorRepresentation['size2'] = int(np.prod(shape2)) + + # Update output size (may differ due to broadcasting) + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + + # Check if scalar broadcasting (input2 is scalar) + self.operatorRepresentation['is_scalar'] = (self.operatorRepresentation['size2'] == 1) + + return ctxt, True + + +class SnitchMulParser(MulParser): + """ + Snitch-specific Mul Parser. + Inherits from Generic MulParser and adds shape/broadcasting information. + """ + + def __init__(self): + super().__init__() + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + """ + Extend Generic parser to add shape and broadcasting information. 
+ """ + # Call parent method first + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if not ret: + return ctxt, False + + # Get shape information + data_in_1 = ctxt.lookup(node.inputs[0].name) + data_in_2 = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + # Store shape information + self.operatorRepresentation['shape1'] = shape1 + self.operatorRepresentation['shape2'] = shape2 + self.operatorRepresentation['out_shape'] = out_shape + + # Calculate sizes + self.operatorRepresentation['size1'] = int(np.prod(shape1)) + self.operatorRepresentation['size2'] = int(np.prod(shape2)) + + # Update output size (may differ due to broadcasting) + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + + # Check if scalar broadcasting (input2 is scalar) + self.operatorRepresentation['is_scalar'] = (self.operatorRepresentation['size2'] == 1) + + return ctxt, True diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py index d62d1c3802..32bf53190f 100644 --- a/Deeploy/Targets/Snitch/Platform.py +++ b/Deeploy/Targets/Snitch/Platform.py @@ -2,46 +2,69 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List +from typing import List, Type import numpy as np +from Deeploy.AbstractDataTypes import Pointer, PointerClass, VoidType from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer -from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicLayerNormBindings, BasicMatMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding -from Deeploy.Targets.Generic.Layers import AddLayer, GatherLayer, GEMMLayer, LayerNormLayer, MatMulLayer, PadLayer, \ - ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer, iNoNormLayer -from Deeploy.Targets.Generic.Parsers import AddParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \ - RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser +from Deeploy.Targets.Generic.Bindings import BasicConcatBindings, BasicGatherBindings, BasicLayerNormBindings, \ + BasicMatMulBindings, BasicPad1DBindings, BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding, \ + BasicTransposeBindings +from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, DivLayer, GatherLayer, GEMMLayer, HardSwishLayer, \ + LayerNormLayer, MatMulLayer, MulLayer, PadLayer, ReshapeLayer, RMSNormLayer, RQGEMMLayer, RQIntegerDivLayer, \ + SoftmaxLayer, TransposeLayer, iNoNormLayer +from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \ + ReshapeParser, RQAddParser, RQIntegerDivParser, SoftmaxParser, TransposeParser, UnsqueezeParser, iLayerNormParser, \ + iNoNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import AddRequantMergePass, GEMMRequantMergePass, \ IntegerDivRequantMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.PULPOpen.Platform import RQAddMapper -from Deeploy.Targets.Snitch.Parsers import 
SnitchGEMMParser, SnitchRQGEMMParser +from Deeploy.Targets.Snitch.Bindings import BasicDivBindings, BasicHardSwishBindings, BasicMulBindings, \ + BasicRMSNormBindings, SnitchAddBindings, SnitchGemmBindings, SnitchiNoNormBindings, SnitchiSoftmaxBindings, \ + SnitchRQAddBindings, SnitchRqGemmBindings +from Deeploy.Targets.Snitch.Parsers import HardSwishParser, SnitchDivParser, SnitchGEMMParser, SnitchMulParser, \ + SnitchRMSNormParser, SnitchRQGEMMParser from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate -from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchGemmTilingReadyBindings, \ - SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, SnitchRQAddTilingReadyBindings, \ - SnitchRqGemmTilingReadyBindings +# ============================================================================= +# Mappers for UNTILED mode (using BasicBindings with BasicTransformer) +# These are used by generateNetwork.py (testRunner_snitch.py) +# ============================================================================= GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings) +ReshapeMapper = NodeMapper(ReshapeParser(), BasicReshapeBindings) +TransposeMapper = NodeMapper(TransposeParser(), BasicTransposeBindings) +ConcatMapper = NodeMapper(ConcatParser(), BasicConcatBindings) RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) -MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) -GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings) -RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings) -iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) -SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) -iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings) +# These use TiledTransformer but work in both modes (original upstream behavior) +GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmBindings) +RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmBindings) +iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxBindings) +SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxBindings) +iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormBindings) iLayerNormMapper = NodeMapper(iLayerNormParser(), BasicLayerNormBindings) -RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings) -AddMapper = NodeMapper(AddParser(), SnitchAddTileReadyBindings) +RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddBindings) +AddMapper = NodeMapper(AddParser(), SnitchAddBindings) + +# New operators for microLlama - using BasicBindings for untiled mode +RMSNormMapper = NodeMapper(SnitchRMSNormParser(), BasicRMSNormBindings) +HardSwishMapper = NodeMapper(HardSwishParser(), BasicHardSwishBindings) +MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) +DivMapper = NodeMapper(SnitchDivParser(), BasicDivBindings) +MulMapper = NodeMapper(SnitchMulParser(), BasicMulBindings) +# ============================================================================= +# SnitchMapping - for UNTILED mode (generateNetwork.py) +# Uses BasicBindings for new operators, TiledTransformer bindings for original ops +# ============================================================================= SnitchMapping = 
{ 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), 'Gather': GatherLayer([GatherMapper]), @@ -56,6 +79,72 @@ 'iLayerNorm': LayerNormLayer([iLayerNormMapper]), 'RequantizedAdd': AddLayer([RQAddMapper]), 'Add': AddLayer([AddMapper]), + 'RMSNorm': RMSNormLayer([RMSNormMapper]), + 'HardSwish': HardSwishLayer([HardSwishMapper]), + 'Div': DivLayer([DivMapper]), + 'Mul': MulLayer([MulMapper]), + 'Reshape': ReshapeLayer([ReshapeMapper]), + 'Transpose': TransposeLayer([TransposeMapper]), + 'Concat': ConcatLayer([ConcatMapper]), +} + +# ============================================================================= +# Import TilingReadyBindings for TILED mode (testMVP.py) +# These will be used by TilerDeployerWrapper +# ============================================================================= +from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchConcatTilingReadyBindings, \ + SnitchDivTilingReadyBindings, SnitchGatherTilingReadyBindings, SnitchGemmTilingReadyBindings, \ + SnitchHardSwishTilingReadyBindings, SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, \ + SnitchMatMulTilingReadyBindings, SnitchMulTilingReadyBindings, SnitchReshapeTilingReadyBindings, \ + SnitchRMSNormTilingReadyBindings, SnitchRQAddTilingReadyBindings, SnitchRqGemmTilingReadyBindings, \ + SnitchTransposeTilingReadyBindings + +# ============================================================================= +# Tiled Mappers - for TILED mode (testMVP.py via TilerDeployerWrapper) +# ============================================================================= +TiledGatherMapper = NodeMapper(GatherParser(), SnitchGatherTilingReadyBindings) +TiledUnsqueezeMapper = NodeMapper(UnsqueezeParser(), SnitchReshapeTilingReadyBindings) +TiledReshapeMapper = NodeMapper(ReshapeParser(), SnitchReshapeTilingReadyBindings) +TiledTransposeMapper = NodeMapper(TransposeParser(), SnitchTransposeTilingReadyBindings) +TiledConcatMapper = NodeMapper(ConcatParser(), SnitchConcatTilingReadyBindings) +TiledMatMulMapper = NodeMapper(MatMulParser(), SnitchMatMulTilingReadyBindings) +TiledRMSNormMapper = NodeMapper(SnitchRMSNormParser(), SnitchRMSNormTilingReadyBindings) +TiledHardSwishMapper = NodeMapper(HardSwishParser(), SnitchHardSwishTilingReadyBindings) +TiledDivMapper = NodeMapper(SnitchDivParser(), SnitchDivTilingReadyBindings) +TiledMulMapper = NodeMapper(SnitchMulParser(), SnitchMulTilingReadyBindings) +TiledGemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings) +TiledRqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings) +TilediSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) +TiledSoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) +TilediNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings) +TiledRQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings) +TiledAddMapper = NodeMapper(AddParser(), SnitchAddTileReadyBindings) + +# ============================================================================= +# SnitchTiledMapping - for TILED mode (testMVP.py) +# Uses TilingReadyBindings for all operators +# ============================================================================= +SnitchTiledMapping = { + 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), + 'Gather': GatherLayer([TiledGatherMapper]), + 'Pad': PadLayer([Pad1DMapper, Pad2DMapper]), + 'Unsqueeze': ReshapeLayer([TiledUnsqueezeMapper]), + 'MatMul': MatMulLayer([TiledMatMulMapper]), + 'Gemm': 
GEMMLayer([TiledGemmMapper]), + 'RQGemm': RQGEMMLayer([TiledRqGemmMapper]), + 'iSoftmax': SoftmaxLayer([TilediSoftmaxMapper]), + 'Softmax': SoftmaxLayer([TiledSoftmaxMapper]), + 'iNoNorm': iNoNormLayer([TilediNoNormMapper]), + 'iLayerNorm': LayerNormLayer([iLayerNormMapper]), + 'RequantizedAdd': AddLayer([TiledRQAddMapper]), + 'Add': AddLayer([TiledAddMapper]), + 'RMSNorm': RMSNormLayer([TiledRMSNormMapper]), + 'HardSwish': HardSwishLayer([TiledHardSwishMapper]), + 'Div': DivLayer([TiledDivMapper]), + 'Mul': MulLayer([TiledMulMapper]), + 'Reshape': ReshapeLayer([TiledReshapeMapper]), + 'Transpose': TransposeLayer([TiledTransposeMapper]), + 'Concat': ConcatLayer([TiledConcatMapper]), } @@ -105,6 +194,12 @@ class SnitchConstantBuffer(ConstantBuffer): allocTemplate = AllocateTemplate.snitchL2GlobalAllocateTemplate deallocTemplate = FreeTemplate.snitchL2GlobalTemplate + def __init__(self, name: str = '', shape = [1], values = [0]): + super().__init__(name, shape, values) + # Initialize _type with a default value to prevent AttributeError + # The actual type will be set later via annotateType + self._type: Type[Pointer] = PointerClass(VoidType) + def _bufferRepresentation(self): operatorRepresentation = super()._bufferRepresentation() @@ -163,3 +258,21 @@ def __init__(self, transientBuffer = SnitchTransientBuffer, includeList: List[str] = _includeList): super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + +class SnitchTiledClusterEngine(DeploymentEngine): + + def __init__(self, name: str, Mapping = SnitchTiledMapping, initCode = "", includeList = _includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) + + +class SnitchTiledPlatform(DeploymentPlatform): + + def __init__(self, + engines = [SnitchTiledClusterEngine("SnitchCluster")], + variableBuffer = SnitchVariableBuffer, + constantBuffer = SnitchConstantBuffer, + structBuffer = SnitchStructBuffer, + transientBuffer = SnitchTransientBuffer, + includeList: List[str] = _includeList): + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) diff --git a/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py new file mode 100644 index 0000000000..5c5675c58f --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +# Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatAddTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Always initialize these variables to avoid Mako errors + operatorRepresentation.setdefault('need_broadcast', False) + operatorRepresentation.setdefault('ndim', 0) + operatorRepresentation.setdefault('strides1_str', '{}') + operatorRepresentation.setdefault('strides2_str', '{}') + operatorRepresentation.setdefault('out_shape_str', '{}') + + # If broadcasting is required, generate the stride array strings + if operatorRepresentation['need_broadcast']: + strides1 = operatorRepresentation['strides1'] + strides2 = operatorRepresentation['strides2'] + out_shape = operatorRepresentation['out_shape'] + operatorRepresentation['strides1_str'] = '{' + ', '.join(map(str, strides1)) + '}' + 
operatorRepresentation['strides2_str'] = '{' + ', '.join(map(str, strides2)) + '}' + operatorRepresentation['out_shape_str'] = '{' + ', '.join(map(str, out_shape)) + '}' + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _FloatAddTemplate(""" +// Snitch FP32 Add (Name: ${nodeName}, Op: ${nodeOp}) +% if need_broadcast: +{ + uint32_t strides1[${ndim}] = ${strides1_str}; + uint32_t strides2[${ndim}] = ${strides2_str}; + uint32_t out_shape[${ndim}] = ${out_shape_str}; + Add_fp32_broadcast(${data_in_1}, ${data_in_2}, ${data_out}, out_shape, strides1, strides2, ${ndim}, ${size}); +} +% else: +Add_fp32(${data_in_1}, ${data_in_2}, ${data_out}, ${size}); +% endif +""") diff --git a/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py new file mode 100644 index 0000000000..ee35255e24 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py @@ -0,0 +1,49 @@ +# ~/Deeploy/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py + +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatDivTemplate(NodeTemplate): + """Template for FP32 Div operation with dynamic template selection.""" + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # Check if scalar broadcasting + is_scalar = operatorRepresentation.get('is_scalar', False) + + # Dynamically select template based on is_scalar flag + if is_scalar: + # Use scalar broadcasting version + self.templateStr = FloatDivScalarTemplateStr + else: + # Use element-wise version + self.templateStr = FloatDivTemplateStr + + return ctxt, operatorRepresentation, [] + + +# Template for element-wise division +FloatDivTemplateStr = r""" +Div_fp32(${input1}, ${input2}, ${output}, ${size}); +""" + +# Template for scalar broadcasting (optimized) +FloatDivScalarTemplateStr = r""" +{ + float32_t scalar = ${input2}[0]; + Div_fp32_scalar(${input1}, scalar, ${output}, ${size}); +} +""" + +# Create reference template with default (element-wise) +referenceTemplate = FloatDivTemplate(FloatDivTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py new file mode 100644 index 0000000000..1615282437 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatHardSwishTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation["data_in"]) + operatorRepresentation["size"] = int(np.prod(data_in.shape)) + + return ctxt, operatorRepresentation, [] + + +FloatHardSwishTemplateStr = r""" +HardSwish_fp32(${data_in}, ${data_out}, ${size}); +""" + +referenceTemplate = FloatHardSwishTemplate(FloatHardSwishTemplateStr) diff --git 
a/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py new file mode 100644 index 0000000000..0cd0a649e1 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = NodeTemplate(""" +// Matmul (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { + ${A_type.typeName} ref_${data_out}_${A} = ${A}; + ${B_type.typeName} ref_${data_out}_${B} = ${B}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for(uint32_t i=0; i<${batch}; i++){ + MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; + } +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py new file mode 100644 index 0000000000..7a970e6411 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatMulTemplate(NodeTemplate): + """Template for FP32 Mul operation with dynamic template selection.""" + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # Check if scalar broadcasting + is_scalar = operatorRepresentation.get('is_scalar', False) + + # Dynamically select template based on is_scalar flag + if is_scalar: + # Use scalar broadcasting version + self.templateStr = FloatMulScalarTemplateStr + else: + # Use element-wise version + self.templateStr = FloatMulTemplateStr + + return ctxt, operatorRepresentation, [] + + +# Template for element-wise multiplication +# Note: MulParser uses A, B, C for input1, input2, output respectively +FloatMulTemplateStr = r""" +Mul_fp32(${A}, ${B}, ${C}, ${size}); +""" + +# Template for scalar broadcasting (optimized) +FloatMulScalarTemplateStr = r""" +{ + float32_t scalar = ${B}[0]; + Mul_fp32_scalar(${A}, scalar, ${C}, ${size}); +} +""" + +# Create reference template with default (element-wise) +referenceTemplate = FloatMulTemplate(FloatMulTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py new file mode 100644 index 0000000000..8ae4d95e01 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatRMSNormTemplate(NodeTemplate): + + def __init__(self, templateStr): + 
super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation["data_in"]) + operatorRepresentation["lastDimLength"] = data_in.shape[-1] + operatorRepresentation["size"] = int(np.prod(data_in.shape)) + + return ctxt, operatorRepresentation, [] + + +FloatRMSNormTemplateStr = r""" +RMSNorm_fp32(${data_in}, ${weight}, ${data_out}, ${size}, ${lastDimLength}, ${eps}); +""" + +referenceTemplate = FloatRMSNormTemplate(FloatRMSNormTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py index 216ff35b9a..a8f32b32e3 100644 --- a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py @@ -25,8 +25,8 @@ def alignToContext(self, ctxt: NetworkContext, FloatSoftmaxTemplateStr = r""" - uint32_t batch_size = ${size} / ${lastDimLength}; - uint32_t compute_num = 1; //snrt_cluster_compute_core_num(); + int32_t batch_size = ${size} / ${lastDimLength}; + int32_t compute_num = 1; //snrt_cluster_compute_core_num(); int32_t ldI = compute_num * ${input_samples}; int32_t batch_offset = ${seq_len} * ${input_samples}; diff --git a/Deeploy/Targets/Snitch/Templates/GatherTemplate.py b/Deeploy/Targets/Snitch/Templates/GatherTemplate.py new file mode 100644 index 0000000000..fa4f6a2a86 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/GatherTemplate.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = NodeTemplate(""" +// Gather (Name: ${nodeName}, Op: ${nodeOp}) +<% +width = int(data_in_type.referencedType.typeWidth/8) +%> +if (snrt_cluster_core_idx() == 0) { +for (uint32_t i=0; i<${batch}; ++i) { + memcpy(${data_out} + i * ${axis_length}, ${data_in} + i * ${batch_length} + ${index} * ${axis_length}, ${axis_length} * ${width}); +} +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/MatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/MatMulTemplate.py new file mode 100644 index 0000000000..bce916ea60 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/MatMulTemplate.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _MatMulTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + A = ctxt.lookup(operatorRepresentation['A']) + B = ctxt.lookup(operatorRepresentation['B']) + C = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['A_offset'] = 0 + operatorRepresentation['B_offset'] = 0 + operatorRepresentation['C_offset'] = 0 + if hasattr(A, "_signed") and hasattr(A, "nLevels"): + operatorRepresentation['A_offset'] = (A._signed == 0) * int(A.nLevels / 2) + if hasattr(B, "_signed") and hasattr(B, "nLevels"): + operatorRepresentation['B_offset'] = (B._signed == 0) * int(B.nLevels / 2) + if hasattr(C, "_signed") and hasattr(C, "nLevels"): + 
operatorRepresentation['C_offset'] = -(C._signed == 0) * int(C.nLevels / 2) + + return ctxt, operatorRepresentation, [] + + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = _MatMulTemplate(""" +// MatMul (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { + ${A_type.typeName} ref_${data_out}_${A} = ${A}; + ${B_type.typeName} ref_${data_out}_${B} = ${B}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for(uint32_t i=0;i<${batch};i++){ + MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ${A_offset}, ${B_offset}, ${C_offset} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; + } +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py b/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py new file mode 100644 index 0000000000..a99573b27b --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer + + +class _SnitchReshapeTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # SCHEREMO: Selectively mark 'indices' dead, since we don't need them + if 'indices' in operatorRepresentation.keys(): + ctxt.globalObjects[operatorRepresentation['indices']]._deploy = False + ctxt.globalObjects[operatorRepresentation['indices']]._live = False + + # Same for "shape" + if "shape" in operatorRepresentation.keys(): + ctxt.globalObjects[operatorRepresentation["shape"]]._deploy = False + ctxt.globalObjects[operatorRepresentation["shape"]]._live = False + + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # Link aliases to each buffer + bufferIn.aliases.add(bufferOut.name) + bufferOut.aliases.add(bufferIn.name) + + return ctxt, operatorRepresentation, [] + + +# Use snrt_cluster_core_idx() == 0 instead of SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = _SnitchReshapeTemplate(""" +// Reshape (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { ${data_out} = ${data_in}; } +""") diff --git a/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py b/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py new file mode 100644 index 0000000000..5e33f85aa0 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Use snrt_cluster_core_idx() == 0 instead of BEGIN_SINGLE_CORE macro to avoid core_id dependency +referenceTemplate = NodeTemplate(""" +// Transpose ${data_in_shape} -> ${data_out_shape} (Name: ${nodeName}, Op: ${nodeOp}) +if (snrt_cluster_core_idx() == 0) { 
+${data_out_type.typeName} dummy_${data_out} = ${data_out}; +<% + dimStr = '' + accessStr = '' + shapeStr = '' + for dim in data_in_shape: + dimStr += '['+str(dim)+']' +%> +% for idx, i in enumerate(perm[:-1]): +<% + shapeStr += '['+str(data_in_shape[idx+1])+']' +%> +% endfor +% for idx, i in enumerate(perm): +<% + shape = data_out_shape[idx] + accessStr += '[i_'+str(idx)+']' +%> +for(uint32_t i_${i} = 0; i_${i}<${shape}; i_${i}++){ +% endfor +*dummy_${data_out}++ = ((${data_in_type.referencedType.typeName} (*)${shapeStr})${data_in})${accessStr}; +% for idx, i in enumerate(perm): +} +% endfor +} +""") diff --git a/Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py new file mode 100644 index 0000000000..b9b07be30a --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/FloatDivTileConstraint.py @@ -0,0 +1,112 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class FloatDivTileConstraint(TileConstraint): + """Tile constraint for FP32 Div operation supporting scalar broadcasting.""" + + dataIn1Name = "input1" + dataIn2Name = "input2" + dataOutName = "output" + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict[cls.dataIn1Name] + inputBuffer2Name = parseDict[cls.dataIn2Name] + outputBufferName = parseDict[cls.dataOutName] + + input1Shape = ctxt.lookup(inputBuffer1Name).shape + input2Shape = ctxt.lookup(inputBuffer2Name).shape + + # Add tensor dimensions to model + tilerModel.addTensorDimToModel(ctxt, inputBuffer1Name) + tilerModel.addTensorDimToModel(ctxt, outputBufferName) + + # Check if input2 is scalar (total size == 1) + is_scalar = np.prod(input2Shape) == 1 + + if is_scalar: + # Scalar broadcasting: input2 is a scalar, don't tile it + # Only add input2 dimensions if it has more than 0 dims + if len(input2Shape) > 0: + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + # Constrain scalar to remain untiled (size 1) + for dim in range(len(input2Shape)): + input2DimVar = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + tilerModel.addConstraint(input2DimVar == input2Shape[dim]) + + # Input1 and output must have same dimensions + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + else: + # Element-wise: both inputs must have same shape + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + 
outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + + tilerModel.addConstraint(inputDim1Var == inputDim2Var) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + # Check if scalar broadcasting + input2Name = operatorRepresentation[cls.dataIn2Name] + input2Shape = ctxt.lookup(input2Name).shape + is_scalar = np.prod(input2Shape) == 1 + + for cube in outputCubes: + newSize = np.prod(cube.dims) + replacements["size"].append(newSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + if is_scalar: + # For scalar, load the entire scalar tensor (size 1) + scalarCube = HyperRectangle(tuple([0] * len(input2Shape)), tuple(input2Shape)) + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: scalarCube}) + else: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) + + for out in outputCubes: + outputLoadSchedule.append({cls.dataOutName: out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py new file mode 100644 index 0000000000..99df639004 --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/FloatMulTileConstraint.py @@ -0,0 +1,112 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class FloatMulTileConstraint(TileConstraint): + """Tile constraint for FP32 Mul operation supporting scalar broadcasting.""" + + dataIn1Name = "A" + dataIn2Name = "B" + dataOutName = "C" + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict[cls.dataIn1Name] + inputBuffer2Name = parseDict[cls.dataIn2Name] + outputBufferName = parseDict[cls.dataOutName] + + input1Shape = ctxt.lookup(inputBuffer1Name).shape + input2Shape = ctxt.lookup(inputBuffer2Name).shape + + # Add tensor dimensions to model + tilerModel.addTensorDimToModel(ctxt, inputBuffer1Name) + 
tilerModel.addTensorDimToModel(ctxt, outputBufferName) + + # Check if input2 is scalar (total size == 1) + is_scalar = np.prod(input2Shape) == 1 + + if is_scalar: + # Scalar broadcasting: input2 is a scalar, don't tile it + # Only add input2 dimensions if it has more than 0 dims + if len(input2Shape) > 0: + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + # Constrain scalar to remain untiled (size 1) + for dim in range(len(input2Shape)): + input2DimVar = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + tilerModel.addConstraint(input2DimVar == input2Shape[dim]) + + # Input1 and output must have same dimensions + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + else: + # Element-wise: both inputs must have same shape + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + + tilerModel.addConstraint(inputDim1Var == inputDim2Var) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + # Check if scalar broadcasting + input2Name = operatorRepresentation[cls.dataIn2Name] + input2Shape = ctxt.lookup(input2Name).shape + is_scalar = np.prod(input2Shape) == 1 + + for cube in outputCubes: + newSize = np.prod(cube.dims) + replacements["size"].append(newSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + if is_scalar: + # For scalar, load the entire scalar tensor (size 1) + scalarCube = HyperRectangle(tuple([0] * len(input2Shape)), tuple(input2Shape)) + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: scalarCube}) + else: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) + + for out in outputCubes: + outputLoadSchedule.append({cls.dataOutName: out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py new file mode 100644 index 0000000000..1bafa36e3b --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/ReshapeTileConstraint.py @@ -0,0 +1,143 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing 
import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class ReshapeTileConstraint(TileConstraint): + """Tile constraint for Reshape operation - a NOP that just reinterprets data layout.""" + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + pointer: List[str] = [] + + for key, value in parseDict.items(): + if not isinstance(value, str): + continue + + if ctxt.is_global(value) or ctxt.is_local(value): + pointer.append(value) + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + _buffer = ctxt.lookup(bufferName) + tilerModel.addTensorDimToModel(ctxt, bufferName) + + for idx, shapeDim in enumerate(_buffer.shape): + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = bufferName, dimIdx = idx) <= shapeDim) + + # Constrain total elements to be equal + inputBuffer = ctxt.lookup(inputBufferName) + outputBuffer = ctxt.lookup(outputBufferName) + + # For reshape, we want the tiles to have the same total number of elements + # This is automatically satisfied if we tile based on output and compute input from that + + # Remove unused tensors from deployment + for bufferName in pointer: + if bufferName not in [inputBufferName, outputBufferName]: + ctxt.lookup(bufferName)._deploy = False + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # For reshape, input and output have the same data, just different interpretations + # We need to compute the corresponding input cube for each output cube + inputName = operatorRepresentation['data_in'] + outputName = operatorRepresentation['data_out'] + inputShape = ctxt.lookup(inputName).shape + outputShape = ctxt.lookup(outputName).shape + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + # Calculate the flat offset and size for the output cube + outSize = np.prod(cube.dims) + replacements["size"].append(outSize) + + # For reshape, we need to map output cube to input cube + # Calculate flat index range for output cube + outOffset = 0 + outStrides = [] + stride = 1 + for dim in reversed(outputShape): + outStrides.insert(0, stride) + stride *= dim + + for i, (off, dim) in enumerate(zip(cube.offset, cube.dims)): + outOffset += off * outStrides[i] + + # Convert flat offset to input coordinates + inStrides = [] + stride = 1 + 
for dim in reversed(inputShape): + inStrides.insert(0, stride) + stride *= dim + + inOffset = [] + remaining = outOffset + for i, stride in enumerate(inStrides): + inOffset.append(remaining // stride) + remaining = remaining % stride + + # Calculate input cube dimensions + # For simplicity, treat as 1D cube in input space + inCubeDims = list(inputShape) + inCubeOffset = [0] * len(inputShape) + + # Set the last dimension to the size, and offset based on flat index + totalSize = outSize + if len(inputShape) > 0: + # Compute proper input cube that covers the same elements + # Use a simple approach: linearize the input + inCubeOffset = list(inOffset) + inCubeDims = [1] * len(inputShape) + inCubeDims[-1] = min(totalSize, inputShape[-1] - inCubeOffset[-1]) + remaining = totalSize - inCubeDims[-1] + + for i in range(len(inputShape) - 2, -1, -1): + if remaining <= 0: + break + inCubeDims[i] = min(remaining // np.prod(inputShape[i + 1:]) + 1, inputShape[i]) + remaining -= (inCubeDims[i] - 1) * np.prod(inputShape[i + 1:]) + + inputCube = HyperRectangle(tuple(inCubeOffset), tuple(inCubeDims)) + inputLoadSchedule.append({"data_in": inputCube}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/__init__.py b/Deeploy/Targets/Snitch/TileConstraints/__init__.py index 947a6fd82a..aece19d881 100644 --- a/Deeploy/Targets/Snitch/TileConstraints/__init__.py +++ b/Deeploy/Targets/Snitch/TileConstraints/__init__.py @@ -3,5 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from . 
import * +from .FloatDivTileConstraint import * +from .FloatMulTileConstraint import * from .iNoNormTileConstraint import * from .iSoftmaxTileConstraint import * +from .ReshapeTileConstraint import * diff --git a/Deeploy/Targets/Snitch/Tiler.py b/Deeploy/Targets/Snitch/Tiler.py index 475a425779..5a5f4d0bf4 100644 --- a/Deeploy/Targets/Snitch/Tiler.py +++ b/Deeploy/Targets/Snitch/Tiler.py @@ -3,10 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint -from Deeploy.Targets.Snitch.Bindings import SnitchAddBindings, SnitchGemmBindings, SnitchiNoNormBindings, \ - SnitchiSoftmaxBindings, SnitchRQAddBindings, SnitchRqGemmBindings +from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iRMSNormTileConstraint import iRMSNormTileConstraint +from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint +from Deeploy.Targets.Snitch.Bindings import SnitchAddBindings, SnitchConcatBindings, SnitchDivBindings, \ + SnitchGatherBindings, SnitchGemmBindings, SnitchHardSwishBindings, SnitchiNoNormBindings, SnitchiSoftmaxBindings, \ + SnitchMatMulBindings, SnitchMulBindings, SnitchReshapeBindings, SnitchRMSNormBindings, SnitchRQAddBindings, \ + SnitchRqGemmBindings, SnitchTransposeBindings from Deeploy.Targets.Snitch.TileConstraints import iNoNormTileConstraint, iSoftmaxTileConstraint +from Deeploy.Targets.Snitch.TileConstraints.FloatDivTileConstraint import FloatDivTileConstraint +from Deeploy.Targets.Snitch.TileConstraints.FloatMulTileConstraint import FloatMulTileConstraint from Deeploy.Targets.Snitch.TileConstraints.GemmTileConstraint import GemmTileConstraint +from Deeploy.Targets.Snitch.TileConstraints.ReshapeTileConstraint import ReshapeTileConstraint from Deeploy.Targets.Snitch.TileConstraints.RqGemmTileConstraint import RqGemmTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings @@ -23,3 +34,30 @@ SnitchAddTileReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchAddBindings, tileConstraint = AddTileConstraint()) + +SnitchRMSNormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchRMSNormBindings, + tileConstraint = iRMSNormTileConstraint()) + +SnitchHardSwishTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchHardSwishBindings, + tileConstraint = iHardswishTileConstraint()) + +SnitchDivTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchDivBindings, + tileConstraint = FloatDivTileConstraint()) + +SnitchMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchMulBindings, + tileConstraint = FloatMulTileConstraint()) + +SnitchMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchMatMulBindings, + tileConstraint = MatMulTileConstraint()) + +SnitchConcatTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchConcatBindings, + tileConstraint = ConcatTileConstraint()) + +SnitchTransposeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchTransposeBindings, + tileConstraint = TransposeTileConstraint()) + +SnitchReshapeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings 
= SnitchReshapeBindings, + tileConstraint = ReshapeTileConstraint()) + +SnitchGatherTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchGatherBindings, + tileConstraint = GatherTileConstraint()) diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz new file mode 100644 index 0000000000..eec4cee600 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx b/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx new file mode 100644 index 0000000000..7a146e5541 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx @@ -0,0 +1,14 @@ + +hardswish_test_fp32: +* +inputoutputHardSwish_node" HardSwishhardswish_graph_fp32Z +input + + + +€b +output + + + +€B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz new file mode 100644 index 0000000000..074c937f5b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/inputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/inputs.npz new file mode 100644 index 0000000000..9d14ca82f7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/network.onnx b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/network.onnx new file mode 100644 index 0000000000..25a7a9b683 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/outputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/outputs.npz new file mode 100644 index 0000000000..6167f74042 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm_fused/outputs.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/activations.npz new file mode 100644 index 0000000000..d077979636 Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/activations.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/inputs.npz new file mode 100644 index 0000000000..89c505c669 Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/inputs.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/network.onnx new file mode 100644 index 0000000000..30b6d8420f Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/network.onnx differ diff --git a/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/outputs.npz new file mode 100644 index 0000000000..07fda6854d Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/microLlama_fp32_1/outputs.npz differ diff --git a/DeeployTest/testRunner_tiled_snitch.py b/DeeployTest/testRunner_tiled_snitch.py index 7787d1f844..cf6ac6b2e0 100644 --- a/DeeployTest/testRunner_tiled_snitch.py +++ b/DeeployTest/testRunner_tiled_snitch.py @@ -25,7 +25,10 @@ args = parser.parse_args() - testRunner = TestRunner(platform = "Snitch", simulator = args.simulator, tiling = True, argument_parser = parser) + 
testRunner = TestRunner(platform = "Snitch_tiled", + simulator = args.simulator, + tiling = True, + argument_parser = parser) testRunner.cmake_args += f" -D NUM_CORES={args.cores}" diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 48c5777905..9d562cf577 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -24,12 +24,12 @@ from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform from Deeploy.Targets.Snitch.Deployer import SnitchDeployer -from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform +from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform, SnitchTiledPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Snitch_tiled", "Chimera"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -65,6 +65,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Snitch": Platform = SnitchPlatform() + elif platformName == "Snitch_tiled": + Platform = SnitchTiledPlatform() + elif platformName == "SoftHier": Platform = SoftHierPlatform() @@ -217,7 +220,7 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) - elif isinstance(platform, (SnitchPlatform)): + elif isinstance(platform, (SnitchPlatform, SnitchTiledPlatform)): if loweringOptimizer is None: loweringOptimizer = SnitchOptimizer diff --git a/DeeployTest/testUtils/typeMapping.py b/DeeployTest/testUtils/typeMapping.py index 232fd1e274..b6851dec7e 100644 --- a/DeeployTest/testUtils/typeMapping.py +++ b/DeeployTest/testUtils/typeMapping.py @@ -48,7 +48,11 @@ def inferMinimalType(values: np.ndarray, default: Type[BaseType] = int8_t) -> Ty print(f"Warning: Empty input array for type inference for {values}!") return default - if isInteger(values): + # First check the numpy dtype - if it's a float type, use float even if values are integer-like + # This handles cases like [0.0, 0.0] which would otherwise be incorrectly typed as uint8_t + if np.issubdtype(values.dtype, np.floating): + return minimalFloatType(values) + elif isInteger(values): return minimalIntegerType(values) else: return minimalFloatType(values) diff --git a/TargetLibraries/Generic/inc/macros.h b/TargetLibraries/Generic/inc/macros.h index d97cfecb7c..0b5a0e51fb 100644 --- a/TargetLibraries/Generic/inc/macros.h +++ b/TargetLibraries/Generic/inc/macros.h @@ -7,22 +7,28 @@ #ifndef __DEEPLOY_BASIC_MATH_MACROS_HEADER_ #define __DEEPLOY_BASIC_MATH_MACROS_HEADER_ +#ifndef MAX #define MAX(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a > _b ? _a : _b; \ }) +#endif +#ifndef MIN #define MIN(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a < _b ? _a : _b; \ }) +#endif +#ifndef CLAMP #define CLAMP(x, low, high) \ (((x) > (high)) ? (high) : (((x) < (low)) ? 
(low) : (x))) +#endif #define inf 1.0f / 0.0f diff --git a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h index e44d3c20c6..1305ba6bff 100644 --- a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h +++ b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h @@ -23,8 +23,13 @@ #include "snrt.h" +#include "kernel/Add.h" +#include "kernel/Div.h" #include "kernel/Gemm.h" +#include "kernel/HardSwish.h" #include "kernel/MatMul.h" +#include "kernel/Mul.h" +#include "kernel/RMSNrom.h" #include "kernel/RQGemm.h" #include "kernel/RQMatMul.h" #include "kernel/Softmax.h" diff --git a/TargetLibraries/Snitch/inc/kernel/Add.h b/TargetLibraries/Snitch/inc/kernel/Add.h new file mode 100644 index 0000000000..7a65e82712 --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/Add.h @@ -0,0 +1,21 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_ADD_KERNEL_HEADER_ +#define __DEEPLOY_MATH_ADD_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +void Add_fp32(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, uint32_t size); + +void Add_fp32_broadcast(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t *out_shape, uint32_t *strides1, + uint32_t *strides2, uint32_t ndim, uint32_t size); + +void Add_fp32_lastdim(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t outer_size, uint32_t inner_size); + +#endif // __DEEPLOY_MATH_ADD_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Div.h b/TargetLibraries/Snitch/inc/kernel/Div.h new file mode 100644 index 0000000000..e9b257a634 --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/Div.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_ +#define __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Division (FP32) + * + * Computes: output[i] = input1[i] / input2[i] + * + * input1: Numerator tensor (float32) + * input2: Denominator tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size); + +/* + * Element-wise Division with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] / scalar + * + * input1: Numerator tensor (float32) + * scalar: Scalar denominator (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size); + +#endif // __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/HardSwish.h b/TargetLibraries/Snitch/inc/kernel/HardSwish.h new file mode 100644 index 0000000000..a0cfdaac12 --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/HardSwish.h @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_ +#define __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * HardSwish Activation Function + * + * Computes: HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + * + * Piecewise form: + * - When x <= -3: output = 0 + * - When -3 < x < 
3: output = x * (x/6 + 0.5) + * - When x >= 3: output = x + * + * This is a computationally efficient approximation of Swish/SiLU activation + * commonly used in mobile neural networks and transformer models. + * + * data_in: Input tensor (FP32) + * data_out: Output tensor (FP32, same shape as input) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void HardSwish_fp32(float32_t *data_in, float32_t *data_out, uint32_t size); + +#endif // __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Mul.h b/TargetLibraries/Snitch/inc/kernel/Mul.h new file mode 100644 index 0000000000..d851e2e3bf --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/Mul.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_ +#define __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Multiplication (FP32) + * + * Computes: output[i] = input1[i] * input2[i] + * + * input1: First input tensor (float32) + * input2: Second input tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size); + +/* + * Element-wise Multiplication with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] * scalar + * + * input1: Input tensor (float32) + * scalar: Scalar multiplier (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size); + +#endif // __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/RMSNrom.h b/TargetLibraries/Snitch/inc/kernel/RMSNrom.h new file mode 100644 index 0000000000..16e25cd38c --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/RMSNrom.h @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * RMS Normalization (Root Mean Square Normalization) + * + * Computes: output[i] = (input[i] / rms) * weight[i] + * where rms = sqrt(mean(input^2) + eps) + * + * data_in: Input tensor [batch, seq, hidden] or flattened [size] + * weight: Weight tensor [hidden_dim] + * data_out: Output tensor (same shape as input) + * size: Total number of elements (batch * seq * hidden) + * lastDimLength: Hidden dimension size + * eps: Epsilon for numerical stability (typically 1e-6) + * + * multi-core = yes + * parallelization = vector-wise (across batch * sequence) + */ +void RMSNorm_fp32(float32_t *data_in, float32_t *weight, float32_t *data_out, + uint32_t size, uint32_t lastDimLength, float32_t eps); + +#endif // __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Softmax.h b/TargetLibraries/Snitch/inc/kernel/Softmax.h index c2d7596e7a..3795bb4f3b 100644 --- a/TargetLibraries/Snitch/inc/kernel/Softmax.h +++ b/TargetLibraries/Snitch/inc/kernel/Softmax.h @@ -9,7 +9,7 @@ #include "DeeploySnitchMath.h" -void softmax_fp32(float *input, float *output, int32_t ldI, +void Softmax_fp32(float *input, 
float *output, int32_t ldI, int32_t batch_offset, int32_t batch_size, int32_t seq_len, int32_t input_samples); diff --git a/TargetLibraries/Snitch/inc/macros.h b/TargetLibraries/Snitch/inc/macros.h index bc1191d25a..04bef2394b 100644 --- a/TargetLibraries/Snitch/inc/macros.h +++ b/TargetLibraries/Snitch/inc/macros.h @@ -8,10 +8,19 @@ #define __DEEPLOY_MATH_MACROS_HEADER_ #define INT_LOG2(x) __builtin_ctz(x) + +#ifndef CLAMP #define CLAMP(x, low, high) \ (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x))) +#endif + +#ifndef MIN #define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif + +#ifndef MAX #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif // JUNGVI: The following macros are here to ensure compatibility with some // PULP-NN kernels diff --git a/TargetLibraries/Snitch/src/Add_fp32.c b/TargetLibraries/Snitch/src/Add_fp32.c new file mode 100644 index 0000000000..235b258511 --- /dev/null +++ b/TargetLibraries/Snitch/src/Add_fp32.c @@ -0,0 +1,102 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +void Add_fp32(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + pOut[i] = pIn1[i] + pIn2[i]; + } +} + +void Add_fp32_broadcast(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t *out_shape, uint32_t *strides1, + uint32_t *strides2, uint32_t ndim, uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + uint32_t idx1 = 0; + uint32_t idx2 = 0; + uint32_t tmp = i; + + for (int32_t d = ndim - 1; d >= 0; d--) { + uint32_t coord = tmp % out_shape[d]; + tmp /= out_shape[d]; + idx1 += coord * strides1[d]; + idx2 += coord * strides2[d]; + } + + pOut[i] = pIn1[idx1] + pIn2[idx2]; + } +} + +void Add_fp32_lastdim(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t outer_size, uint32_t inner_size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + uint32_t size = outer_size * inner_size; + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + uint32_t inner_idx = i % inner_size; + pOut[i] = pIn1[i] + 
pIn2[inner_idx]; + } +} diff --git a/TargetLibraries/Snitch/src/CycleCounter.c b/TargetLibraries/Snitch/src/CycleCounter.c index 3861c421c1..8a99c312e6 100644 --- a/TargetLibraries/Snitch/src/CycleCounter.c +++ b/TargetLibraries/Snitch/src/CycleCounter.c @@ -6,10 +6,15 @@ #include "DeeploySnitchMath.h" +// Define ENABLE_INSTR_COUNTER to enable instruction counting (causes warnings +// in gvsoc) #define ENABLE_INSTR_COUNTER + static uint32_t timer_init[NUM_CORES] __attribute__((section(".l1"))); static uint32_t timer_end[NUM_CORES] __attribute__((section(".l1"))); +#ifdef ENABLE_INSTR_COUNTER static uint32_t instr_init[NUM_CORES] __attribute__((section(".l1"))); static uint32_t instr_end[NUM_CORES] __attribute__((section(".l1"))); +#endif static uint32_t running[NUM_CORES] __attribute__((section(".l1"))); @@ -17,11 +22,13 @@ void ResetTimer() { snrt_reset_perf_counter(SNRT_PERF_CNT0); uint32_t const core_id = snrt_global_core_idx(); uint32_t _timer_init = read_csr(mcycle); - uint32_t _instr_init = read_csr(minstret); timer_init[core_id] = _timer_init; - instr_init[core_id] = _instr_init; timer_end[core_id] = _timer_init; +#ifdef ENABLE_INSTR_COUNTER + uint32_t _instr_init = read_csr(minstret); + instr_init[core_id] = _instr_init; instr_end[core_id] = _instr_init; +#endif running[core_id] = 0; } @@ -31,7 +38,9 @@ void StartTimer() { } uint32_t const core_id = snrt_global_core_idx(); timer_init[core_id] = read_csr(mcycle); +#ifdef ENABLE_INSTR_COUNTER instr_init[core_id] = read_csr(minstret); +#endif running[core_id] = 1; } @@ -41,7 +50,9 @@ void StopTimer() { } uint32_t const core_id = snrt_global_core_idx(); timer_end[core_id] = read_csr(mcycle); +#ifdef ENABLE_INSTR_COUNTER instr_end[core_id] = read_csr(minstret); +#endif running[core_id] = 0; } @@ -55,6 +66,7 @@ uint32_t getCycles() { } uint32_t getInstr(void) { +#ifdef ENABLE_INSTR_COUNTER uint32_t const core_id = snrt_global_core_idx(); if (running[core_id]) { @@ -62,4 +74,7 @@ uint32_t getInstr(void) { } else { return instr_end[core_id] - instr_init[core_id]; } +#else + return 0; // Instruction counting disabled +#endif } \ No newline at end of file diff --git a/TargetLibraries/Snitch/src/Div_fp32.c b/TargetLibraries/Snitch/src/Div_fp32.c new file mode 100644 index 0000000000..07c3d3c5d4 --- /dev/null +++ b/TargetLibraries/Snitch/src/Div_fp32.c @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Division (FP32) + * + * Computes: output[i] = input1[i] / input2[i] + * + * Supports ONNX broadcasting rules: + * - If input2 is scalar (size=1): divides all elements of input1 by input2[0] + * - If both have same size: element-wise division + * + * input1: Numerator tensor (float32) + * input2: Denominator tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise across input1 + */ +void Div_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize across elements + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + 
start_elem = core_id * elements_per_core + remainder; + } + + // Check if input2 is a scalar (size=1, broadcasted) + // Note: This assumes the parser has set input2_size correctly + // For now, we assume element-wise division (same size) + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] / input2[i]; + } +} + +/* + * Element-wise Division with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] / scalar + * + * input1: Numerator tensor (float32) + * scalar: Scalar denominator (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + float32_t inv_scalar = 1.0f / scalar; // Compute inverse once + + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] * inv_scalar; + } +} diff --git a/TargetLibraries/Snitch/src/Gemm_fp32.c b/TargetLibraries/Snitch/src/Gemm_fp32.c index 9a79538e12..8dac98ef67 100644 --- a/TargetLibraries/Snitch/src/Gemm_fp32.c +++ b/TargetLibraries/Snitch/src/Gemm_fp32.c @@ -11,231 +11,50 @@ void gemm_fp32_transB_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A, uint32_t ldA, float32_t *B, uint32_t ldB, float32_t *C, uint32_t ldC, float32_t *Y, uint32_t BETA, uint32_t setup_SSR) { + (void)setup_SSR; uint32_t compute_id = snrt_global_compute_core_idx(); uint32_t A_offset = K * compute_id; uint32_t C_offset = N * compute_id; - // Unrolling factor of most inner loop. 
- // Should be at least as high as the FMA delay - // for maximum utilization - const uint32_t unroll = 8; - - // SSR strides and bounds only have to be configured - // once in the beginning - if (setup_SSR) { - // First matrix is not stored in transposed format - const uint32_t ssr0_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr0_i[4] = {0, sizeof(float32_t), 0, - sizeof(float32_t) * ldA}; - - // Second matrix is stored in transposed format - const uint32_t ssr1_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr1_i[4] = {sizeof(float32_t) * K, sizeof(float32_t), - sizeof(float32_t) * K * unroll, 0}; - - snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], - ssr0_i[2], ssr0_i[3]); - - snrt_ssr_repeat(SNRT_SSR_DM0, unroll); - snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], - ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); - } - - // SSR start address need to be configured each time - - snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, &A[A_offset]); - snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, B); - snrt_ssr_enable(); - - // check dimensions and values of a and b - - // Kernel progresses by 1 values each step - // const uint32_t n_frep = K - 1; for (uint32_t m = 0; m < M; m++) { - uint32_t n = 0; - for (uint32_t n0 = 0; n0 < N / unroll; n0++) { - float c[unroll]; - - // Load intermediate result - if (BETA) { - c[0] = C[C_offset + m * ldC + n + 0]; - c[1] = C[C_offset + m * ldC + n + 1]; - c[2] = C[C_offset + m * ldC + n + 2]; - c[3] = C[C_offset + m * ldC + n + 3]; - c[4] = C[C_offset + m * ldC + n + 4]; - c[5] = C[C_offset + m * ldC + n + 5]; - c[6] = C[C_offset + m * ldC + n + 6]; - c[7] = C[C_offset + m * ldC + n + 7]; - } else { - c[0] = 0.0; - c[1] = 0.0; - c[2] = 0.0; - c[3] = 0.0; - c[4] = 0.0; - c[5] = 0.0; - c[6] = 0.0; - c[7] = 0.0; - } - - asm volatile( - "frep.o %[n_frep], 8, 0, 0 \n" - "fmadd.s %[c0], ft0, ft1, %[c0] \n" - "fmadd.s %[c1], ft0, ft1, %[c1] \n" - "fmadd.s %[c2], ft0, ft1, %[c2] \n" - "fmadd.s %[c3], ft0, ft1, %[c3] \n" - "fmadd.s %[c4], ft0, ft1, %[c4] \n" - "fmadd.s %[c5], ft0, ft1, %[c5] \n" - "fmadd.s %[c6], ft0, ft1, %[c6] \n" - "fmadd.s %[c7], ft0, ft1, %[c7] \n" - : [c0] "+f"(c[0]), [c1] "+f"(c[1]), [c2] "+f"(c[2]), [c3] "+f"(c[3]), - [c4] "+f"(c[4]), [c5] "+f"(c[5]), [c6] "+f"(c[6]), [c7] "+f"(c[7]) - : [n_frep] "r"(K - 1) - : "ft0", "ft1", "ft2"); - - // Store results back - Y[C_offset + m * ldC + n + 0] = c[0]; - Y[C_offset + m * ldC + n + 1] = c[1]; - Y[C_offset + m * ldC + n + 2] = c[2]; - Y[C_offset + m * ldC + n + 3] = c[3]; - Y[C_offset + m * ldC + n + 4] = c[4]; - Y[C_offset + m * ldC + n + 5] = c[5]; - Y[C_offset + m * ldC + n + 6] = c[6]; - Y[C_offset + m * ldC + n + 7] = c[7]; - n += unroll; - } - - // Clean up of leftover columns - snrt_ssr_disable(); - for (; n < N; n++) { + for (uint32_t n = 0; n < N; n++) { float32_t c; if (BETA) { c = C[C_offset + m * ldC + n]; } else { - c = 0.0; + c = 0.0f; } for (uint32_t k = 0; k < K; k++) { - c += A[A_offset + k + m * ldA] * B[k + n * ldB]; + c += A[A_offset + m * ldA + k] * B[n * ldB + k]; } Y[C_offset + m * ldC + n] = c; } - snrt_ssr_enable(); } - snrt_ssr_disable(); } void gemm_fp32_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A, uint32_t ldA, float32_t *B, uint32_t ldB, float32_t *C, uint32_t ldC, float32_t *Y, uint32_t BETA, uint32_t setup_SSR) { + (void)setup_SSR; + uint32_t compute_id = snrt_global_compute_core_idx(); uint32_t A_offset = K * compute_id; uint32_t C_offset = N * compute_id; - // Unrolling factor of most inner loop. 
- // Should be at least as high as the FMA delay - // for maximum utilization - const uint32_t unroll = 8; - - // SSR strides and bounds only have to be configured - // once in the beginning - if (setup_SSR) { - // First matrix is not stored in transposed format - const uint32_t ssr0_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr0_i[4] = {0, sizeof(float32_t), 0, - sizeof(float32_t) * ldA}; - - // Second matrix is not stored in transposed format - const uint32_t ssr1_b[4] = {unroll, K, N / unroll, M}; - const uint32_t ssr1_i[4] = {sizeof(float32_t), sizeof(float32_t) * ldB, - sizeof(float32_t) * unroll, 0}; - - snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], - ssr0_i[2], ssr0_i[3]); - - snrt_ssr_repeat(SNRT_SSR_DM0, unroll); - snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], - ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); - } - - // SSR start address need to be configured each time - - snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, &A[A_offset]); - snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, B); - snrt_ssr_enable(); - - // check dimensions and values of a and b - - // Kernel progresses by 1 values each step - // const uint32_t n_frep = K - 1; for (uint32_t m = 0; m < M; m++) { - uint32_t n = 0; - for (uint32_t n0 = 0; n0 < N / unroll; n0++) { - float c[unroll]; - - // Load intermediate result - if (BETA) { - c[0] = C[C_offset + m * ldC + n + 0]; - c[1] = C[C_offset + m * ldC + n + 1]; - c[2] = C[C_offset + m * ldC + n + 2]; - c[3] = C[C_offset + m * ldC + n + 3]; - c[4] = C[C_offset + m * ldC + n + 4]; - c[5] = C[C_offset + m * ldC + n + 5]; - c[6] = C[C_offset + m * ldC + n + 6]; - c[7] = C[C_offset + m * ldC + n + 7]; - } else { - c[0] = 0.0; - c[1] = 0.0; - c[2] = 0.0; - c[3] = 0.0; - c[4] = 0.0; - c[5] = 0.0; - c[6] = 0.0; - c[7] = 0.0; - } - - asm volatile( - "frep.o %[n_frep], 8, 0, 0 \n" - "fmadd.s %[c0], ft0, ft1, %[c0] \n" - "fmadd.s %[c1], ft0, ft1, %[c1] \n" - "fmadd.s %[c2], ft0, ft1, %[c2] \n" - "fmadd.s %[c3], ft0, ft1, %[c3] \n" - "fmadd.s %[c4], ft0, ft1, %[c4] \n" - "fmadd.s %[c5], ft0, ft1, %[c5] \n" - "fmadd.s %[c6], ft0, ft1, %[c6] \n" - "fmadd.s %[c7], ft0, ft1, %[c7] \n" - : [c0] "+f"(c[0]), [c1] "+f"(c[1]), [c2] "+f"(c[2]), [c3] "+f"(c[3]), - [c4] "+f"(c[4]), [c5] "+f"(c[5]), [c6] "+f"(c[6]), [c7] "+f"(c[7]) - : [n_frep] "r"(K - 1) - : "ft0", "ft1", "ft2"); - - // Store results back - Y[C_offset + m * ldC + n + 0] = c[0]; - Y[C_offset + m * ldC + n + 1] = c[1]; - Y[C_offset + m * ldC + n + 2] = c[2]; - Y[C_offset + m * ldC + n + 3] = c[3]; - Y[C_offset + m * ldC + n + 4] = c[4]; - Y[C_offset + m * ldC + n + 5] = c[5]; - Y[C_offset + m * ldC + n + 6] = c[6]; - Y[C_offset + m * ldC + n + 7] = c[7]; - n += unroll; - } - - // Clean up of leftover columns - snrt_ssr_disable(); - for (; n < N; n++) { + for (uint32_t n = 0; n < N; n++) { float32_t c; if (BETA) { c = C[C_offset + m * ldC + n]; } else { - c = 0.0; + c = 0.0f; } for (uint32_t k = 0; k < K; k++) { - c += A[A_offset + k + m * ldA] * B[k * ldB + n]; + c += A[A_offset + m * ldA + k] * B[k * ldB + n]; } Y[C_offset + m * ldC + n] = c; } - snrt_ssr_enable(); } - snrt_ssr_disable(); } diff --git a/TargetLibraries/Snitch/src/HardSwish.c b/TargetLibraries/Snitch/src/HardSwish.c new file mode 100644 index 0000000000..b7e9679c64 --- /dev/null +++ b/TargetLibraries/Snitch/src/HardSwish.c @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +void 
HardSwish_fp32(float32_t *data_in, float32_t *data_out, uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize by dividing work across cores + uint32_t chunk_size = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, end; + if (core_id < remainder) { + chunk_size += 1; + start = core_id * chunk_size; + } else { + start = core_id * chunk_size + remainder; + } + end = start + chunk_size; + + // HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + // Piecewise: + // x <= -3: output = 0 + // -3 < x < 3: output = x * (x/6 + 0.5) + // x >= 3: output = x + + for (uint32_t i = start; i < end; i++) { + float32_t x = data_in[i]; + float32_t clip_val = x / 6.0f + 0.5f; + + // Clamp to [0, 1] + if (clip_val < 0.0f) { + clip_val = 0.0f; + } else if (clip_val > 1.0f) { + clip_val = 1.0f; + } + + data_out[i] = x * clip_val; + } +} diff --git a/TargetLibraries/Snitch/src/Mul_fp32.c b/TargetLibraries/Snitch/src/Mul_fp32.c new file mode 100644 index 0000000000..80d6bc9b33 --- /dev/null +++ b/TargetLibraries/Snitch/src/Mul_fp32.c @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Multiplication (FP32) + * + * Computes: output[i] = input1[i] * input2[i] + * + * Supports ONNX broadcasting rules: + * - If input2 is scalar (size=1): multiplies all elements of input1 by + * input2[0] + * - If both have same size: element-wise multiplication + * + * input1: First input tensor (float32) + * input2: Second input tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise across input1 + */ +void Mul_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize across elements + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + // Element-wise multiplication + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] * input2[i]; + } +} + +/* + * Element-wise Multiplication with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] * scalar + * + * input1: Input tensor (float32) + * scalar: Scalar multiplier (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] 
= input1[i] * scalar; + } +} diff --git a/TargetLibraries/Snitch/src/RMSNrom_fp32.c b/TargetLibraries/Snitch/src/RMSNrom_fp32.c new file mode 100644 index 0000000000..9c615ce923 --- /dev/null +++ b/TargetLibraries/Snitch/src/RMSNrom_fp32.c @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" +#include + +void RMSNorm_fp32(float32_t *data_in, float32_t *weight, float32_t *data_out, + uint32_t size, uint32_t lastDimLength, float32_t eps) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t num_vectors = size / lastDimLength; + + // Parallelize across vectors (batch * sequence dimension) + uint32_t vectors_per_core = num_vectors / numThreads; + uint32_t remainder = num_vectors % numThreads; + + uint32_t start_vec, num_vecs; + if (core_id < remainder) { + num_vecs = vectors_per_core + 1; + start_vec = core_id * num_vecs; + } else { + num_vecs = vectors_per_core; + start_vec = core_id * vectors_per_core + remainder; + } + + for (uint32_t v = start_vec; v < start_vec + num_vecs; v++) { + float32_t *in_ptr = data_in + v * lastDimLength; + float32_t *out_ptr = data_out + v * lastDimLength; + + // Compute sum of squares + float32_t sum_sq = 0.0f; + for (uint32_t i = 0; i < lastDimLength; i++) { + sum_sq += in_ptr[i] * in_ptr[i]; + } + + // Compute RMS with epsilon + float32_t rms = sqrtf(sum_sq / (float32_t)lastDimLength + eps); + float32_t inv_rms = 1.0f / rms; + + // Apply normalization and weight + for (uint32_t i = 0; i < lastDimLength; i++) { + out_ptr[i] = in_ptr[i] * inv_rms * weight[i]; + } + } +}
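
For reference, the RMSNorm_fp32 kernel above normalizes each vector along the last dimension by sqrt(mean(x^2) + eps) and then scales it element-wise by the weight vector. A minimal NumPy sketch of the same computation is given below; it is not part of the patch, and the helper name rmsnorm_fp32_reference is hypothetical. Under those assumptions it can serve to sanity-check the kernel against golden tensors such as the inputs.npz/outputs.npz files under DeeployTest/Tests/Kernels/FP32/RMSNorm_fused.

import numpy as np

def rmsnorm_fp32_reference(data_in: np.ndarray, weight: np.ndarray,
                           eps: float = 1e-6) -> np.ndarray:
    # Mean of squares over the last dimension (lastDimLength in the C kernel)
    mean_sq = np.mean(np.square(data_in.astype(np.float32)), axis=-1, keepdims=True)
    # rms = sqrt(mean(x^2) + eps), matching the kernel's sqrtf(sum_sq / lastDimLength + eps)
    rms = np.sqrt(mean_sq + np.float32(eps))
    # output[i] = (input[i] / rms) * weight[i]
    return (data_in / rms * weight).astype(np.float32)

# Example usage: generate a random activation and an all-ones weight,
# then compare the result against the Snitch kernel's output tile by tile.
x = np.random.randn(4, 128).astype(np.float32)
w = np.ones(128, dtype=np.float32)
y = rmsnorm_fp32_reference(x, w)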