diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e88b77..a0d9fe9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,14 +13,12 @@ if(CCACHE_PROGRAM)
endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/utils")
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/dependencies")
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
-# The CUDA standard is still C++14 to enable interopability with
-# slightly older and still well-supported versions of CUDA/nvcc
-# (e.g. CUDA < 11). This will be bumped to 17 once CUDA 11 is
-# required.
-set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD 20)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# no modules in this library
diff --git a/Folder.DotSettings b/Folder.DotSettings
new file mode 100644
index 0000000..ea9fb4b
--- /dev/null
+++ b/Folder.DotSettings
@@ -0,0 +1,6 @@
+
+ <NamingElement Priority="6" Title="Parameters"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="function parameter" /><type Name="lambda parameter" /></Descriptor><Policy Inspect="True" WarnAboutPrefixesAndSuffixes="False" Prefix="" Suffix="" Style="aaBb"><ExtraRule Prefix="_" Suffix="" Style="aaBb" /></Policy></NamingElement>
+ <NamingElement Priority="16" Title="Other constants"><Descriptor Static="True" Constexpr="Indeterminate" Const="True" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="class field" /><type Name="local variable" /><type Name="struct field" /></Descriptor><Policy Inspect="True" WarnAboutPrefixesAndSuffixes="False" Prefix="" Suffix="" Style="AA_BB"><ExtraRule Prefix="" Suffix="" Style="aa_bb" /></Policy></NamingElement>
+ <NamingElement Priority="15" Title="Enumerators"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="scoped enumerator" /><type Name="unscoped enumerator" /></Descriptor><Policy Inspect="True" WarnAboutPrefixesAndSuffixes="False" Prefix="" Suffix="" Style="AA_BB"><ExtraRule Prefix="" Suffix="" Style="aa_bb" /></Policy></NamingElement>
+ <NamingElement Priority="3" Title="Enums"><Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"><type Name="enum" /></Descriptor><Policy Inspect="True" WarnAboutPrefixesAndSuffixes="False" Prefix="" Suffix="" Style="AaBb_AaBb"><ExtraRule Prefix="" Suffix="" Style="aa_bb" /></Policy></NamingElement>
+ True
\ No newline at end of file
diff --git a/cmake/dependencies/FindCUDNN.cmake b/cmake/dependencies/FindCUDNN.cmake
index a150310..fd77eea 100644
--- a/cmake/dependencies/FindCUDNN.cmake
+++ b/cmake/dependencies/FindCUDNN.cmake
@@ -76,4 +76,8 @@ if(CUDNN_FOUND)
endif()
endif()
+if (CUDNN_FOUND AND CUDNN_VERSION VERSION_LESS "8.0")
+ message(FATAL_ERROR "Flashlight requires cuDNN >= 8.0, found ${CUDNN_VERSION}")
+endif()
+
mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY CUDNN_VERSION)
diff --git a/cmake/utils/flashlightConfig.cmake.in b/cmake/utils/flashlightConfig.cmake.in
index 2bf2550..9d73423 100644
--- a/cmake/utils/flashlightConfig.cmake.in
+++ b/cmake/utils/flashlightConfig.cmake.in
@@ -49,7 +49,7 @@ if (@FL_BUILD_STANDALONE@)
endif()
if (@FL_USE_CUDA@)
if (@FL_USE_CUDNN@)
- find_dependency(CUDNN 7.1)
+ find_dependency(CUDNN 8)
endif()
if (@FL_BUILD_DISTRIBUTED@)
find_dependency(NCCL)
diff --git a/cmake/utils/fm_target_utilities.cmake b/cmake/utils/fm_target_utilities.cmake
index c3ad681..f5310d7 100644
--- a/cmake/utils/fm_target_utilities.cmake
+++ b/cmake/utils/fm_target_utilities.cmake
@@ -53,10 +53,17 @@ function(fm_glob OUT_VAR)
set(GLOB_PATTERNS ${ARG_PATTERNS})
endif()
- if(GLOB_PATTERNS)
+ # Normalize paths to prevent CONFIGURE_DEPENDS cache mismatch issues on Windows
+ set(NORMALIZED_PATTERNS "")
+ foreach(PATTERN IN LISTS GLOB_PATTERNS)
+ cmake_path(ABSOLUTE_PATH PATTERN NORMALIZE OUTPUT_VARIABLE NORMALIZED)
+ list(APPEND NORMALIZED_PATTERNS "${NORMALIZED}")
+ endforeach()
+
+ if(NORMALIZED_PATTERNS)
file(GLOB_RECURSE FOUND_FILES
CONFIGURE_DEPENDS
- ${GLOB_PATTERNS}
+ ${NORMALIZED_PATTERNS}
)
set(${OUT_VAR} ${${OUT_VAR}} ${FOUND_FILES} PARENT_SCOPE)
endif()
diff --git a/flashlight/fl/autograd/Functions.cpp b/flashlight/fl/autograd/Functions.cpp
index 70b7a9e..3eac143 100644
--- a/flashlight/fl/autograd/Functions.cpp
+++ b/flashlight/fl/autograd/Functions.cpp
@@ -1,8 +1,8 @@
/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * SPDX-License-Identifier: MIT
*
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
+ * Original code: Copyright (c) Meta Platforms, Inc. (see FLASHLIGHT_LICENSE)
+ * Modifications: Copyright (c) 2026 Lukas Thomann (see LICENSE)
*/
#include
@@ -24,7 +24,7 @@
namespace fl {
namespace detail {
- Tensor tileAs(const Tensor& input, const Shape& rdims) {
+ Tensor tileAs(Tensor const& input, Shape const& rdims) {
// Scalar tensor
if(input.ndim() == 0)
return tile(input, rdims);
@@ -36,7 +36,7 @@ namespace detail {
if(rdims[i] % idimsSize != 0) {
std::stringstream ss;
ss << "Invalid dims for tileAs for input dims " << idims
- << " to output dims " << rdims;
+ << " to output dims " << rdims;
throw std::invalid_argument(ss.str());
}
dims[i] = rdims[i] / idimsSize;
@@ -44,19 +44,19 @@ namespace detail {
return tile(input, dims);
}
- Tensor sumAs(const Tensor& input, const Shape& rdims) {
+ Tensor sumAs(Tensor const& input, Shape const& rdims) {
Shape idims = input.shape();
auto result = input;
for(int i = 0; i < input.ndim(); i++)
if(i + 1 > rdims.ndim() || idims[i] != rdims[i])
result = fl::sum(result, {i}, /* keepDims = */ true);
- return fl::reshape(result.astype(input.type()), rdims);
+ return fl::reshape(result.asType(input.type()), rdims);
}
Shape expandedShapeFromReducedDims(
- const Tensor& input,
- const std::vector& axes,
+ Tensor const& input,
+ std::vector const& axes,
bool keepDims /* = false */
) {
// Fast path - tensor already retained its shape
@@ -72,7 +72,7 @@ namespace detail {
unsigned inputIdx = 0;
for(unsigned i = 0; i < preNDims; ++i) {
if(i == axes[axesIdx])
- // This dim was reduced over, leave as 1 in the new shape
+ // This dim was reduced over, leave as 1 in the new shape
axesIdx++;
else {
// Dim wasn't reduced over - add the shape from the new tensor
@@ -83,10 +83,10 @@ namespace detail {
return newShape;
}
-// TODO: remove these/use a simple template
+  // TODO: collapse these two expandFromReduction overloads into a single template
Variable expandFromReduction(
- const Variable& input,
- const std::vector& axes,
+ Variable const& input,
+ std::vector const& axes,
bool keepDims
) {
return moddims(
@@ -96,8 +96,8 @@ namespace detail {
}
Tensor expandFromReduction(
- const Tensor& input,
- const std::vector& axes,
+ Tensor const& input,
+ std::vector const& axes,
bool keepDims
) {
auto o = expandedShapeFromReducedDims(input, axes, keepDims);
@@ -107,75 +107,87 @@ namespace detail {
);
}
- bool areVariableTypesEqual(const Variable& a, const Variable& b) { return a.type() == b.type(); }
+ bool areVariableTypesEqual(Variable const& a, Variable const& b) { return a.type() == b.type(); }
} // namespace detail
-Variable operator+(const Variable& lhs, const Variable& rhs) {
+Variable operator+(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = lhs.tensor() + rhs.tensor();
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(Variable(gradOutput.tensor(), false));
- inputs[1].addGrad(Variable(gradOutput.tensor(), false));
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(Variable(gradOutput.tensor(), false));
+ inputs[1].addGrad(Variable(gradOutput.tensor(), false));
+ };
return Variable(result, {lhs.withoutData(), rhs.withoutData()}, gradFunc);
}
-Variable operator+(const Variable& lhs, const double& rhsVal) {
- auto result = (lhs.tensor() + rhsVal).astype(lhs.type());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(Variable(gradOutput.tensor(), false));
- };
+Variable operator+(Variable const& lhs, double const& rhsVal) {
+ auto result = (lhs.tensor() + rhsVal).asType(lhs.type());
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(Variable(gradOutput.tensor(), false));
+ };
return Variable(result, {lhs.withoutData()}, gradFunc);
}
-Variable operator+(const double& lhsVal, const Variable& rhs) { return rhs + lhsVal; }
+Variable operator+(double const& lhsVal, Variable const& rhs) { return rhs + lhsVal; }
-Variable operator-(const Variable& lhs, const Variable& rhs) {
+Variable operator-(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = lhs.tensor() - rhs.tensor();
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(Variable(gradOutput.tensor(), false));
- inputs[1].addGrad(Variable(negate(gradOutput).tensor(), false));
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(Variable(gradOutput.tensor(), false));
+ inputs[1].addGrad(Variable(negate(gradOutput).tensor(), false));
+ };
return Variable(result, {lhs.withoutData(), rhs.withoutData()}, gradFunc);
}
-Variable operator-(const Variable& lhs, const double& rhsVal) {
- auto result = (lhs.tensor() - rhsVal).astype(lhs.type());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(Variable(gradOutput.tensor(), false));
- };
+Variable operator-(Variable const& lhs, double const& rhsVal) {
+ auto result = (lhs.tensor() - rhsVal).asType(lhs.type());
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(Variable(gradOutput.tensor(), false));
+ };
return Variable(result, {lhs.withoutData()}, gradFunc);
}
-Variable operator-(const double& lhsVal, const Variable& rhs) {
- auto result = (lhsVal - rhs.tensor()).astype(rhs.type());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(Variable(negate(gradOutput).tensor(), false));
- };
+Variable operator-(double const& lhsVal, Variable const& rhs) {
+ auto result = (lhsVal - rhs.tensor()).asType(rhs.type());
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(Variable(negate(gradOutput).tensor(), false));
+ };
return Variable(result, {rhs.withoutData()}, gradFunc);
}
-Variable operator*(const Variable& lhs, const Variable& rhs) {
+Variable operator*(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = lhs.tensor() * rhs.tensor();
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- if(inputs[0].isCalcGrad())
- inputs[0].addGrad(
- Variable(gradOutput.tensor() * inputs[1].tensor(), false)
- );
- if(inputs[1].isCalcGrad())
- inputs[1].addGrad(
- Variable(gradOutput.tensor() * inputs[0].tensor(), false)
- );
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ if(inputs[0].isCalcGrad())
+ inputs[0].addGrad(
+ Variable(gradOutput.tensor() * inputs[1].tensor(), false)
+ );
+ if(inputs[1].isCalcGrad())
+ inputs[1].addGrad(
+ Variable(gradOutput.tensor() * inputs[0].tensor(), false)
+ );
+ };
return Variable(
result,
{
@@ -186,34 +198,35 @@ Variable operator*(const Variable& lhs, const Variable& rhs) {
);
}
-Variable operator*(const Variable& lhs, const double& rhsVal) {
- auto result = (lhs.tensor() * rhsVal).astype(lhs.type());
- auto gradFunc =
- [rhsVal](std::vector& inputs, const Variable& gradOutput) {
- inputs[0].addGrad(Variable(gradOutput.tensor() * rhsVal, false));
- };
+Variable operator*(Variable const& lhs, double const& rhsVal) {
+ auto result = (lhs.tensor() * rhsVal).asType(lhs.type());
+ auto gradFunc = [rhsVal](std::vector& inputs, Variable const& gradOutput) {
+ inputs[0].addGrad(Variable(gradOutput.tensor() * rhsVal, false));
+ };
return Variable(result, {lhs.withoutData()}, gradFunc);
}
-Variable operator*(const double& lhsVal, const Variable& rhs) { return rhs * lhsVal; }
+Variable operator*(double const& lhsVal, Variable const& rhs) { return rhs * lhsVal; }
-Variable operator/(const Variable& lhs, const Variable& rhs) {
+Variable operator/(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = lhs.tensor() / rhs.tensor();
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- auto inputs1rec = reciprocal(inputs[1]);
- auto gradInput0 = gradOutput * inputs1rec;
- if(inputs[0].isCalcGrad())
- inputs[0].addGrad(Variable(gradInput0.tensor(), false));
- if(inputs[1].isCalcGrad())
- inputs[1].addGrad(
- Variable(
- (gradInput0 * negate(inputs[0]) * inputs1rec).tensor(),
- false
- )
- );
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ auto inputs1rec = reciprocal(inputs[1]);
+ auto gradInput0 = gradOutput * inputs1rec;
+ if(inputs[0].isCalcGrad())
+ inputs[0].addGrad(Variable(gradInput0.tensor(), false));
+ if(inputs[1].isCalcGrad())
+ inputs[1].addGrad(
+ Variable(
+ (gradInput0 * negate(inputs[0]) * inputs1rec).tensor(),
+ false
+ )
+ );
+ };
return Variable(
result,
{rhs.isCalcGrad() ? lhs : lhs.withoutData(), rhs},
@@ -221,368 +234,395 @@ Variable operator/(const Variable& lhs, const Variable& rhs) {
);
}
-Variable operator/(const Variable& lhs, const double& rhsVal) {
- auto result = (lhs.tensor() / rhsVal).astype(lhs.type());
+Variable operator/(Variable const& lhs, double const& rhsVal) {
+ auto result = (lhs.tensor() / rhsVal).asType(lhs.type());
auto gradFunc =
- [rhsVal](std::vector& inputs, const Variable& gradOutput) {
- inputs[0].addGrad(Variable((gradOutput / rhsVal).tensor(), false));
- };
+ [rhsVal](std::vector& inputs, Variable const& gradOutput) {
+ inputs[0].addGrad(Variable((gradOutput / rhsVal).tensor(), false));
+ };
return Variable(result, {lhs.withoutData()}, gradFunc);
}
-Variable operator/(const double& lhsVal, const Variable& rhs) {
- auto result = (lhsVal / rhs.tensor()).astype(rhs.type());
+Variable operator/(double const& lhsVal, Variable const& rhs) {
+ auto result = (lhsVal / rhs.tensor()).asType(rhs.type());
auto gradFunc = [lhsVal](
std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(
- Variable(
- (gradOutput * (-lhsVal) / (inputs[0] * inputs[0])).tensor(),
- false
- )
- );
- };
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(
+ Variable(
+ (gradOutput * (-lhsVal) / (inputs[0] * inputs[0])).tensor(),
+ false
+ )
+ );
+ };
return Variable(result, {rhs}, gradFunc);
}
-Variable operator>(const Variable& lhs, const Variable& rhs) {
+Variable operator>(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = lhs.tensor() > rhs.tensor();
return Variable(result, false);
}
-Variable operator>(const Variable& lhs, const double& rhsVal) {
- auto result = (lhs.tensor() > rhsVal).astype(lhs.type());
+Variable operator>(Variable const& lhs, double const& rhsVal) {
+ auto result = (lhs.tensor() > rhsVal).asType(lhs.type());
return Variable(result, false);
}
-Variable operator>(const double& lhsVal, const Variable& rhs) {
- auto result = (lhsVal > rhs.tensor()).astype(rhs.type());
+Variable operator>(double const& lhsVal, Variable const& rhs) {
+ auto result = (lhsVal > rhs.tensor()).asType(rhs.type());
return Variable(result, false);
}
-Variable operator<(const Variable& lhs, const Variable& rhs) {
+Variable operator<(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = lhs.tensor() < rhs.tensor();
return Variable(result, false);
}
-Variable operator<(const Variable& lhs, const double& rhsVal) {
- auto result = (lhs.tensor() < rhsVal).astype(lhs.type());
+Variable operator<(Variable const& lhs, double const& rhsVal) {
+ auto result = (lhs.tensor() < rhsVal).asType(lhs.type());
return Variable(result, false);
}
-Variable operator<(const double& lhsVal, const Variable& rhs) {
- auto result = (lhsVal < rhs.tensor()).astype(rhs.type());
+Variable operator<(double const& lhsVal, Variable const& rhs) {
+ auto result = (lhsVal < rhs.tensor()).asType(rhs.type());
return Variable(result, false);
}
-Variable operator>=(const Variable& lhs, const Variable& rhs) {
+Variable operator>=(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = lhs.tensor() >= rhs.tensor();
return Variable(result, false);
}
-Variable operator>=(const Variable& lhs, const double& rhsVal) {
- auto result = (lhs.tensor() >= rhsVal).astype(lhs.type());
+Variable operator>=(Variable const& lhs, double const& rhsVal) {
+ auto result = (lhs.tensor() >= rhsVal).asType(lhs.type());
return Variable(result, false);
}
-Variable operator>=(const double& lhsVal, const Variable& rhs) {
- auto result = (lhsVal >= rhs.tensor()).astype(rhs.type());
+Variable operator>=(double const& lhsVal, Variable const& rhs) {
+ auto result = (lhsVal >= rhs.tensor()).asType(rhs.type());
return Variable(result, false);
}
-Variable operator<=(const Variable& lhs, const Variable& rhs) {
+Variable operator<=(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = lhs.tensor() <= rhs.tensor();
return Variable(result, false);
}
-Variable operator<=(const Variable& lhs, const double& rhsVal) {
- auto result = (lhs.tensor() <= rhsVal).astype(lhs.type());
+Variable operator<=(Variable const& lhs, double const& rhsVal) {
+ auto result = (lhs.tensor() <= rhsVal).asType(lhs.type());
return Variable(result, false);
}
-Variable operator<=(const double& lhsVal, const Variable& rhs) {
- auto result = (lhsVal <= rhs.tensor()).astype(rhs.type());
+Variable operator<=(double const& lhsVal, Variable const& rhs) {
+ auto result = (lhsVal <= rhs.tensor()).asType(rhs.type());
return Variable(result, false);
}
-Variable operator&&(const Variable& lhs, const Variable& rhs) {
+Variable operator&&(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = lhs.tensor() && rhs.tensor();
return Variable(result, false);
}
-Variable operator!(const Variable& input) {
- auto result = (!input.tensor()).astype(input.type());
+Variable operator!(Variable const& input) {
+ auto result = (!input.tensor()).asType(input.type());
return Variable(result, false);
}
-Variable max(const Variable& lhs, const Variable& rhs) {
+Variable max(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = fl::maximum(lhs.tensor(), rhs.tensor());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- auto mask = Variable(
- (inputs[0].tensor() > inputs[1].tensor()).astype(gradOutput.type()),
- false
- );
- inputs[0].addGrad(Variable((mask * gradOutput).tensor(), false));
- inputs[1].addGrad(Variable((!mask * gradOutput).tensor(), false));
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ auto mask = Variable(
+ (inputs[0].tensor() > inputs[1].tensor()).asType(gradOutput.type()),
+ false
+ );
+ inputs[0].addGrad(Variable((mask * gradOutput).tensor(), false));
+ inputs[1].addGrad(Variable((!mask * gradOutput).tensor(), false));
+ };
return Variable(result, {lhs, rhs}, gradFunc);
}
-Variable max(const Variable& lhs, const double& rhsVal) {
- auto result = fl::maximum(lhs.tensor(), rhsVal).astype(lhs.type());
+Variable max(Variable const& lhs, double const& rhsVal) {
+ auto result = fl::maximum(lhs.tensor(), rhsVal).asType(lhs.type());
auto gradFunc =
- [rhsVal](std::vector& inputs, const Variable& gradOutput) {
- auto mask = Variable(
- (inputs[0].tensor() > rhsVal).astype(gradOutput.type()),
- false
- );
- inputs[0].addGrad(Variable((mask * gradOutput).tensor(), false));
- };
+ [rhsVal](std::vector& inputs, Variable const& gradOutput) {
+ auto mask = Variable(
+ (inputs[0].tensor() > rhsVal).asType(gradOutput.type()),
+ false
+ );
+ inputs[0].addGrad(Variable((mask * gradOutput).tensor(), false));
+ };
return Variable(result, {lhs}, gradFunc);
}
-Variable max(const double& lhsVal, const Variable& rhs) { return max(rhs, lhsVal); }
+Variable max(double const& lhsVal, Variable const& rhs) { return max(rhs, lhsVal); }
-Variable min(const Variable& lhs, const Variable& rhs) {
+Variable min(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
auto result = fl::minimum(lhs.tensor(), rhs.tensor());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- auto mask = Variable(
- (inputs[0].tensor() < inputs[1].tensor()).astype(gradOutput.type()),
- false
- );
- inputs[0].addGrad(Variable((mask * gradOutput).tensor(), false));
- inputs[1].addGrad(Variable((!mask * gradOutput).tensor(), false));
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ auto mask = Variable(
+ (inputs[0].tensor() < inputs[1].tensor()).asType(gradOutput.type()),
+ false
+ );
+ inputs[0].addGrad(Variable((mask * gradOutput).tensor(), false));
+ inputs[1].addGrad(Variable((!mask * gradOutput).tensor(), false));
+ };
return Variable(result, {lhs, rhs}, gradFunc);
}
-Variable min(const Variable& lhs, const double& rhsVal) {
- auto result = fl::minimum(lhs.tensor(), rhsVal).astype(lhs.type());
+Variable min(Variable const& lhs, double const& rhsVal) {
+ auto result = fl::minimum(lhs.tensor(), rhsVal).asType(lhs.type());
auto gradFunc =
- [rhsVal](std::vector& inputs, const Variable& gradOutput) {
- auto mask = Variable(
- (inputs[0].tensor() < rhsVal).astype(gradOutput.type()),
- false
- );
- inputs[0].addGrad(Variable((mask * gradOutput).tensor(), false));
- };
+ [rhsVal](std::vector& inputs, Variable const& gradOutput) {
+ auto mask = Variable(
+ (inputs[0].tensor() < rhsVal).asType(gradOutput.type()),
+ false
+ );
+ inputs[0].addGrad(Variable((mask * gradOutput).tensor(), false));
+ };
return Variable(result, {lhs}, gradFunc);
}
-Variable min(const double& lhsVal, const Variable& rhs) { return min(rhs, lhsVal); }
+Variable min(double const& lhsVal, Variable const& rhs) { return min(rhs, lhsVal); }
-Variable negate(const Variable& input) {
- auto result = (0.0 - input.tensor()).astype(input.type());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(Variable(negate(gradOutput).tensor(), false));
- };
+Variable negate(Variable const& input) {
+ auto result = (0.0 - input.tensor()).asType(input.type());
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(Variable(negate(gradOutput).tensor(), false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable reciprocal(const Variable& input) {
+Variable reciprocal(Variable const& input) {
auto result = 1.0 / FL_ADJUST_INPUT_TYPE(input.tensor());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- auto res = reciprocal(inputs[0]);
- inputs[0].addGrad(
- Variable((negate(gradOutput) * res * res).tensor(), false)
- );
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ auto res = reciprocal(inputs[0]);
+ inputs[0].addGrad(
+ Variable((negate(gradOutput) * res * res).tensor(), false)
+ );
+ };
return Variable(result, {input}, gradFunc);
}
-Variable exp(const Variable& input) {
+Variable exp(Variable const& input) {
auto result = fl::exp(FL_ADJUST_INPUT_TYPE(input.tensor()));
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(
- Variable(gradOutput.tensor() * fl::exp(inputs[0].tensor()), false)
- );
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(
+ Variable(gradOutput.tensor() * fl::exp(inputs[0].tensor()), false)
+ );
+ };
return Variable(result, {input}, gradFunc);
}
-Variable log(const Variable& input) {
+Variable log(Variable const& input) {
auto result = fl::log(FL_ADJUST_INPUT_TYPE(input.tensor()));
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(
- Variable((gradOutput.tensor() / inputs[0].tensor()), false)
- );
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(
+ Variable((gradOutput.tensor() / inputs[0].tensor()), false)
+ );
+ };
return Variable(result, {input}, gradFunc);
}
-Variable log1p(const Variable& input) {
+Variable log1p(Variable const& input) {
auto result = fl::log1p(FL_ADJUST_INPUT_TYPE(input.tensor()));
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(
- Variable((gradOutput.tensor() / (1.0 + inputs[0].tensor())), false)
- );
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(
+ Variable((gradOutput.tensor() / (1.0 + inputs[0].tensor())), false)
+ );
+ };
return Variable(result, {input}, gradFunc);
}
-Variable pow(const Variable& input, double p) {
+Variable pow(Variable const& input, double p) {
auto result = fl::power(FL_ADJUST_INPUT_TYPE(input.tensor()), p);
- auto gradFunc = [p](std::vector& inputs,
- const Variable& gradOutput) {
- Tensor grad =
- p * fl::power(inputs[0].tensor(), p - 1) * gradOutput.tensor();
- inputs[0].addGrad(Variable(grad, false));
- };
+ auto gradFunc = [p](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ Tensor grad =
+ p * fl::power(inputs[0].tensor(), p - 1) * gradOutput.tensor();
+ inputs[0].addGrad(Variable(grad, false));
+ };
return Variable(result, {input}, gradFunc);
}
-Variable sin(const Variable& input) {
+Variable sin(Variable const& input) {
auto result = fl::sin(input.tensor());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(
- Variable((gradOutput.tensor() * cos(inputs[0].tensor())), false)
- );
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(
+ Variable((gradOutput.tensor() * cos(inputs[0].tensor())), false)
+ );
+ };
return Variable(result, {input}, gradFunc);
}
-Variable cos(const Variable& input) {
+Variable cos(Variable const& input) {
auto result = fl::cos(input.tensor());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(
- Variable(
- (gradOutput.tensor() * negative(sin(inputs[0].tensor()))),
- false
- )
- );
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(
+ Variable(
+ (gradOutput.tensor() * negative(sin(inputs[0].tensor()))),
+ false
+ )
+ );
+ };
return Variable(result, {input}, gradFunc);
}
-Variable tanh(const Variable& input) {
+Variable tanh(Variable const& input) {
auto result = fl::tanh(input.tensor());
auto gradFunc =
- [result](std::vector& inputs, const Variable& gradOutput) {
- auto grad =
- Variable((1.0 - result * result) * gradOutput.tensor(), false);
- inputs[0].addGrad(Variable(grad.tensor(), false));
- };
+ [result](std::vector& inputs, Variable const& gradOutput) {
+ auto grad =
+ Variable((1.0 - result * result) * gradOutput.tensor(), false);
+ inputs[0].addGrad(Variable(grad.tensor(), false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable clamp(const Variable& input, const double lo, const double hi) {
+Variable clamp(Variable const& input, double const lo, double const hi) {
auto result = fl::clip(input.tensor(), lo, hi);
auto gradFunc = [lo, hi, result](
std::vector& inputs,
- const Variable& gradOutput) {
- Tensor gradMask = gradOutput.tensor();
- gradMask = fl::where((result > lo) && (result < hi), gradMask, 0);
- inputs[0].addGrad(Variable(gradMask, false));
- };
+ Variable const& gradOutput
+ ) {
+ Tensor gradMask = gradOutput.tensor();
+ gradMask = fl::where((result > lo) && (result < hi), gradMask, 0);
+ inputs[0].addGrad(Variable(gradMask, false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable sqrt(const Variable& input) {
+Variable sqrt(Variable const& input) {
auto result = fl::sqrt(input.tensor());
auto gradFunc = [result](
std::vector& inputs,
- const Variable& gradOutput) {
- auto output = Variable(result, false);
- inputs[0].addGrad(Variable((gradOutput / (2 * output)).tensor(), false));
- };
+ Variable const& gradOutput
+ ) {
+ auto output = Variable(result, false);
+ inputs[0].addGrad(Variable((gradOutput / (2 * output)).tensor(), false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable sigmoid(const Variable& input) {
+Variable sigmoid(Variable const& input) {
auto result = fl::sigmoid(input.tensor());
auto gradFunc =
- [result](std::vector& inputs, const Variable& gradOutput) {
- auto grad = gradOutput.tensor() * result * (1 - result);
- inputs[0].addGrad(Variable(grad, false));
- };
+ [result](std::vector& inputs, Variable const& gradOutput) {
+ auto grad = gradOutput.tensor() * result * (1 - result);
+ inputs[0].addGrad(Variable(grad, false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable swish(const Variable& input, double beta) { return input * sigmoid(beta * input); }
+Variable swish(Variable const& input, double beta) { return input * sigmoid(beta * input); }
-Variable erf(const Variable& input) {
+Variable erf(Variable const& input) {
auto result = fl::erf(FL_ADJUST_INPUT_TYPE(input.tensor()));
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- auto x = inputs[0].tensor();
- auto grad = gradOutput.tensor() * 2 / std::sqrt(M_PI) * fl::exp(-(x * x));
- inputs[0].addGrad(Variable(grad, false));
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ auto x = inputs[0].tensor();
+ auto grad = gradOutput.tensor() * 2 / std::sqrt(M_PI) * fl::exp(-(x * x));
+ inputs[0].addGrad(Variable(grad, false));
+ };
return Variable(result, {input}, gradFunc);
}
-Variable transpose(const Variable& input, const Shape& dims /* = {} */) {
+Variable transpose(Variable const& input, Shape const& dims /* = {} */) {
auto result = fl::transpose(input.tensor(), dims);
auto gradFunc = [inputDims = input.shape(), ndim = input.ndim(), dims](
std::vector& inputs,
- const Variable& gradOutput) {
- Shape reverseShape = dims;
-
- if(dims.ndim()) {
- // Reverse vec if transposing all dims (empty arg)
- auto dVec = dims.get();
- std::reverse(dVec.begin(), dVec.end());
- reverseShape = Shape(dVec);
- }
+ Variable const& gradOutput
+ ) {
+ Shape reverseShape = dims;
- for(unsigned i = 0; i < reverseShape.ndim(); ++i)
- reverseShape[dims[i]] = i;
+ if(dims.ndim()) {
+ // Reverse vec if transposing all dims (empty arg)
+ auto dVec = dims.get();
+ std::reverse(dVec.begin(), dVec.end());
+ reverseShape = Shape(dVec);
+ }
- inputs[0].addGrad(
- Variable(fl::transpose(gradOutput.tensor(), reverseShape), false)
- );
- };
+ for(unsigned i = 0; i < reverseShape.ndim(); ++i)
+ reverseShape[dims[i]] = i;
+
+ inputs[0].addGrad(
+ Variable(fl::transpose(gradOutput.tensor(), reverseShape), false)
+ );
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable tileAs(const Variable& input, const Shape& rdims) {
+Variable tileAs(Variable const& input, Shape const& rdims) {
auto result = detail::tileAs(input.tensor(), rdims);
Shape inDims = input.shape();
auto gradFunc = [inDims](
std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(
- Variable(
- sumAs(gradOutput, inDims).tensor().astype(inputs[0].type()),
- false
- )
- );
- };
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(
+ Variable(
+ sumAs(gradOutput, inDims).tensor().asType(inputs[0].type()),
+ false
+ )
+ );
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable tileAs(const Variable& input, const Variable& reference) { return tileAs(input, reference.shape()); }
+Variable tileAs(Variable const& input, Variable const& reference) { return tileAs(input, reference.shape()); }
-Variable sumAs(const Variable& input, const Shape& rdims) {
+Variable sumAs(Variable const& input, Shape const& rdims) {
auto result = detail::sumAs(FL_ADJUST_INPUT_TYPE(input.tensor()), rdims);
auto idims = input.tensor().shape();
auto gradFunc =
- [idims](std::vector& inputs, const Variable& gradOutput) {
- inputs[0].addGrad(Variable(tileAs(gradOutput, idims).tensor(), false));
- };
+ [idims](std::vector& inputs, Variable const& gradOutput) {
+ inputs[0].addGrad(Variable(tileAs(gradOutput, idims).tensor(), false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable sumAs(const Variable& input, const Variable& reference) { return sumAs(input, reference.shape()); }
+Variable sumAs(Variable const& input, Variable const& reference) { return sumAs(input, reference.shape()); }
-Variable concatenate(const std::vector& concatInputs, int dim) {
+Variable concatenate(std::vector const& concatInputs, int dim) {
if(concatInputs.empty())
throw std::invalid_argument("cannot concatenate zero variables");
@@ -620,7 +660,7 @@ Variable concatenate(const std::vector& concatInputs, int dim) {
Tensor result(dims, concatInputs[0].type());
std::vector slice(numDims, fl::span);
int start = 0;
- for(const auto& input : concatInputs) {
+ for(auto const& input : concatInputs) {
slice[dim] = fl::range({start, start + input.dim(dim)});
result(slice) = input.tensor();
start += input.dim(dim);
@@ -629,38 +669,39 @@ Variable concatenate(const std::vector& concatInputs, int dim) {
std::vector inputsNoData;
std::vector inDims;
- for(const auto& in : concatInputs) {
+ for(auto const& in : concatInputs) {
inputsNoData.push_back(in.withoutData());
inDims.push_back(in.shape());
}
auto gradFunc = [dim, inDims, numDims](
std::vector& inputs,
- const Variable& gradOutput) {
- std::vector sx(numDims, fl::span);
- int s = 0;
- for(size_t i = 0; i < inputs.size(); ++i) {
- sx[dim] = fl::range(s, s + inDims[i][dim]);
- inputs[i].addGrad(Variable(gradOutput.tensor()(sx), false));
- s += inDims[i][dim];
- }
- };
+ Variable const& gradOutput
+ ) {
+ std::vector sx(numDims, fl::span);
+ int s = 0;
+ for(size_t i = 0; i < inputs.size(); ++i) {
+ sx[dim] = fl::range(s, s + inDims[i][dim]);
+ inputs[i].addGrad(Variable(gradOutput.tensor()(sx), false));
+ s += inDims[i][dim];
+ }
+ };
return Variable(result, inputsNoData, gradFunc);
}
-std::vector split(const Variable& input, long splitSize, int dim) {
+std::vector split(Variable const& input, int64_t splitSize, int dim) {
if(splitSize <= 0)
throw std::invalid_argument("split size must be a positive integer");
auto dimSize = input.dim(dim);
- std::vector splitSizes(dimSize / splitSize, splitSize);
+ std::vector splitSizes(dimSize / splitSize, splitSize);
if(dimSize % splitSize > 0)
splitSizes.push_back(dimSize % splitSize);
return split(input, splitSizes, dim);
}
-std::vector split(const Variable& input, const std::vector& splitSizes, int dim) {
+std::vector split(Variable const& input, std::vector const& splitSizes, int dim) {
if(dim >= input.ndim())
throw std::invalid_argument(
"split: passed dim is larger than the number of dimensions "
@@ -685,24 +726,24 @@ std::vector split(const Variable& input, const std::vector& spli
return outputs;
}
-Variable tile(const Variable& input, const Shape& dims) {
+Variable tile(Variable const& input, Shape const& dims) {
Tensor result = fl::tile(input.tensor(), dims);
Shape idims = input.shape();
auto gradFunc =
- [idims](std::vector& inputs, const Variable& gradOutput) {
- inputs[0].addGrad(
- Variable(
- sumAs(gradOutput, idims).tensor().astype(inputs[0].type()),
- false
- )
- );
- };
+ [idims](std::vector& inputs, Variable const& gradOutput) {
+ inputs[0].addGrad(
+ Variable(
+ sumAs(gradOutput, idims).tensor().asType(inputs[0].type()),
+ false
+ )
+ );
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
Variable sum(
- const Variable& input,
- const std::vector& axes,
+ Variable const& input,
+ std::vector const& axes,
bool keepDims /* = false*/
) {
auto result = FL_ADJUST_INPUT_TYPE(input.tensor());
@@ -711,23 +752,24 @@ Variable sum(
Shape indims = input.shape();
auto gradFunc = [indims, axes, keepDims](
std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(
- Variable(
- detail::tileAs(
- detail::expandFromReduction(gradOutput.tensor(), axes, keepDims),
- indims
- ),
- false
- )
- );
- };
- return Variable(result.astype(input.type()), {input.withoutData()}, gradFunc);
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(
+ Variable(
+ detail::tileAs(
+ detail::expandFromReduction(gradOutput.tensor(), axes, keepDims),
+ indims
+ ),
+ false
+ )
+ );
+ };
+ return Variable(result.asType(input.type()), {input.withoutData()}, gradFunc);
}
Variable mean(
- const Variable& input,
- const std::vector& axes,
+ Variable const& input,
+ std::vector const& axes,
bool keepDims /* = false*/
) {
auto result = FL_ADJUST_INPUT_TYPE(input.tensor());
@@ -736,38 +778,39 @@ Variable mean(
Shape idims = input.shape();
auto gradFunc = [idims, axes, keepDims](
std::vector& inputs,
- const Variable& gradOutput) {
- Shape odims = gradOutput.shape();
- Dim count = 1;
- for(int i = 0; i < idims.ndim(); i++) {
- Dim odimSize = i + 1 > odims.ndim() ? 1 : odims[i];
- count *= idims[i] / odimSize;
- }
- auto grad =
+ Variable const& gradOutput
+ ) {
+ Shape odims = gradOutput.shape();
+ Dim count = 1;
+ for(int i = 0; i < idims.ndim(); i++) {
+ Dim odimSize = i + 1 > odims.ndim() ? 1 : odims[i];
+ count *= idims[i] / odimSize;
+ }
+ auto grad =
+ detail::tileAs(
+ detail::expandFromReduction(gradOutput.tensor(), axes, keepDims),
+ idims
+ )
+ / count;
+ inputs[0].addGrad(
+ Variable(
detail::tileAs(
detail::expandFromReduction(gradOutput.tensor(), axes, keepDims),
idims
)
- / count;
- inputs[0].addGrad(
- Variable(
- detail::tileAs(
- detail::expandFromReduction(gradOutput.tensor(), axes, keepDims),
- idims
- )
- / count,
- false
- )
- );
- };
+ / count,
+ false
+ )
+ );
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
Variable var(
- const Variable& in,
- const std::vector& axes,
- const bool isbiased /* = false */,
+ Variable const& in,
+ std::vector const& axes,
+ bool const isbiased /* = false */,
bool keepDims /* = false*/
) {
Tensor input = FL_ADJUST_INPUT_TYPE(in.tensor());
@@ -785,30 +828,30 @@ Variable var(
result = val * (result - n * avg * avg);
auto gradFunc =
- [val, axes](std::vector& inputs, const Variable& gradOutput) {
- Shape expandedDims = inputs[0].shape();
- Shape tileDims = inputs[0].shape();
- for(auto ax : axes) {
- tileDims[ax] = inputs[0].dim(ax);
- expandedDims[ax] = 1;
- }
+ [val, axes](std::vector& inputs, Variable const& gradOutput) {
+ Shape expandedDims = inputs[0].shape();
+ Shape tileDims = inputs[0].shape();
+ for(auto ax : axes) {
+ tileDims[ax] = inputs[0].dim(ax);
+ expandedDims[ax] = 1;
+ }
- inputs[0].addGrad(
- Variable(
- ((2 * val * tileAs(moddims(gradOutput, expandedDims), tileDims))
- * (inputs[0]
- - tileAs(moddims(mean(inputs[0], axes), expandedDims), tileDims)))
- .tensor(),
- false
- )
- );
- };
+ inputs[0].addGrad(
+ Variable(
+ ((2 * val * tileAs(moddims(gradOutput, expandedDims), tileDims))
+ * (inputs[0]
+ - tileAs(moddims(mean(inputs[0], axes), expandedDims), tileDims)))
+ .tensor(),
+ false
+ )
+ );
+ };
return Variable(result, {in}, gradFunc);
}
Variable norm(
- const Variable& input,
- const std::vector& axes,
+ Variable const& input,
+ std::vector const& axes,
double p /* = 2 */,
bool keepDims /* = false */
) {
@@ -823,25 +866,26 @@ Variable norm(
auto gradFunc = [sumap, p, axes, keepDims](
std::vector& inputs,
- const Variable& gradOutput) {
- // correct, but less precise: auto gvar = Variable(fl::power(result, p - 1),
- // false);
- auto gvar = Variable(fl::power(sumap, 1 - 1 / p), false);
- auto normGrad =
- (inputs[0].tensor() * fl::pow(fl::abs(inputs[0]), p - 2).tensor()
- * detail::tileAs(
- detail::expandFromReduction(gradOutput.tensor(), axes, keepDims)
- / gvar.tensor(),
- inputs[0].shape()
- ));
- inputs[0].addGrad(Variable(normGrad, false));
- };
+ Variable const& gradOutput
+ ) {
+ // correct, but less precise: auto gvar = Variable(fl::power(result, p - 1),
+ // false);
+ auto gvar = Variable(fl::power(sumap, 1 - 1 / p), false);
+ auto normGrad =
+ (inputs[0].tensor() * fl::pow(fl::abs(inputs[0]), p - 2).tensor()
+ * detail::tileAs(
+ detail::expandFromReduction(gradOutput.tensor(), axes, keepDims)
+ / gvar.tensor(),
+ inputs[0].shape()
+ ));
+ inputs[0].addGrad(Variable(normGrad, false));
+ };
return Variable(result, {input}, gradFunc);
}
Variable normalize(
- const Variable& in,
- const std::vector& axes,
+ Variable const& in,
+ std::vector const& axes,
double p /* = 2 */,
double eps /* = 1e-12 */
) {
@@ -851,7 +895,7 @@ Variable normalize(
return input / tileAs(invscale, input);
}
-Variable matmul(const Variable& lhs, const Variable& rhs) {
+Variable matmul(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
// lhs:Input[0] -- [M, N]
// rhs:Input[1] -- [N, K]
@@ -859,50 +903,55 @@ Variable matmul(const Variable& lhs, const Variable& rhs) {
// -- matmul([M, N], [N, K]) -- [M, K]
// result:gradOutput -- [M, K]
auto result = fl::matmul(lhs.tensor(), rhs.tensor());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- if(inputs[0].isCalcGrad()) {
- Tensor _lhs = gradOutput.tensor();
- if(_lhs.ndim() == 1)
- _lhs = fl::reshape(_lhs, {1, _lhs.dim(0)});
- Tensor _rhs = inputs[1].tensor();
- if(_rhs.ndim() == 1)
- _rhs = fl::reshape(_rhs, {_rhs.dim(0), 1});
-
- // matmulNT(gradOutput, inputs[1])
- // -- matmulNT([M, K], [N, K])
- // -- matmul([M, K], [K, N]) -- [M, K]
- auto val = fl::matmul(
- _lhs,
- _rhs,
- /* lhsProp = */ MatrixProperty::None,
- /* rhsProp = */ MatrixProperty::Transpose
- );
- inputs[0].addGrad(Variable(detail::sumAs(val, inputs[0].shape()), false));
- }
- if(inputs[1].isCalcGrad()) {
- Tensor _lhs = inputs[0].tensor();
- if(_lhs.ndim() == 1)
- _lhs = fl::reshape(_lhs, {1, _lhs.dim(0)});
- Tensor _rhs = gradOutput.tensor();
- if(_rhs.ndim() == 1)
- _rhs = fl::reshape(_rhs, {_rhs.dim(0), 1});
-
- // matmulTN(inputs[0], gradOutput)
- // -- matmulTN([M, N], [M, K])
- // -- matmul([N, M], [M, K]) -- [N, K]
- auto val = fl::matmul(
- _lhs,
- _rhs,
- /* lhsProp = */ MatrixProperty::Transpose
- );
- inputs[1].addGrad(Variable(detail::sumAs(val, inputs[1].shape()), false));
- }
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ if(inputs[0].isCalcGrad()) {
+ Tensor _lhs = gradOutput.tensor();
+ if(_lhs.ndim() == 1)
+ _lhs = fl::reshape(_lhs, {1, _lhs.dim(0)});
+ Tensor _rhs = inputs[1].tensor();
+ if(_rhs.ndim() == 1)
+ _rhs = fl::reshape(_rhs, {_rhs.dim(0), 1});
+
+ // matmulNT(gradOutput, inputs[1])
+ // -- matmulNT([M, K], [N, K])
+ // -- matmul([M, K], [K, N]) -- [M, K]
+ auto val = fl::matmul(
+ _lhs,
+ _rhs,
+ /* lhsProp = */
+ MatrixProperty::None,
+ /* rhsProp = */
+ MatrixProperty::Transpose
+ );
+ inputs[0].addGrad(Variable(detail::sumAs(val, inputs[0].shape()), false));
+ }
+ if(inputs[1].isCalcGrad()) {
+ Tensor _lhs = inputs[0].tensor();
+ if(_lhs.ndim() == 1)
+ _lhs = fl::reshape(_lhs, {1, _lhs.dim(0)});
+ Tensor _rhs = gradOutput.tensor();
+ if(_rhs.ndim() == 1)
+ _rhs = fl::reshape(_rhs, {_rhs.dim(0), 1});
+
+ // matmulTN(inputs[0], gradOutput)
+ // -- matmulTN([M, N], [M, K])
+ // -- matmul([N, M], [M, K]) -- [N, K]
+ auto val = fl::matmul(
+ _lhs,
+ _rhs,
+ /* lhsProp = */
+ MatrixProperty::Transpose
+ );
+ inputs[1].addGrad(Variable(detail::sumAs(val, inputs[1].shape()), false));
+ }
+ };
return Variable(result, {lhs, rhs}, gradFunc);
}
-Variable matmulTN(const Variable& lhs, const Variable& rhs) {
+Variable matmulTN(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
// lhs:Input[0] -- [N, M]
// rhs:Input[1] -- [N, K]
@@ -912,34 +961,39 @@ Variable matmulTN(const Variable& lhs, const Variable& rhs) {
// result:gradOutput -- [M, K]
auto result = fl::matmul(
lhs.tensor(),
- rhs.tensor(), /* lhsProp = */
+ rhs.tensor(),
+ /* lhsProp = */
MatrixProperty::Transpose
);
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- if(inputs[0].isCalcGrad()) {
- // matmulNT(inputs[1], gradOutput)
- // -- matmulNT([N, K], [M, K])
- // -- matmul([N, K], [K, M]) -- [N, M]
- auto val = fl::matmul(
- inputs[1].tensor(),
- gradOutput.tensor(),
- /* lhsProp = */ MatrixProperty::None,
- /* rhsProp = */ MatrixProperty::Transpose
- );
- inputs[0].addGrad(Variable(detail::sumAs(val, inputs[0].shape()), false));
- }
- if(inputs[1].isCalcGrad()) {
- // matmul(inputs[0], gradOutput)
- // -- matmulNT([N, M], [M, K]) -- [N, K]
- auto val = fl::matmul(inputs[0].tensor(), gradOutput.tensor());
- inputs[1].addGrad(Variable(detail::sumAs(val, inputs[1].shape()), false));
- }
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ if(inputs[0].isCalcGrad()) {
+ // matmulNT(inputs[1], gradOutput)
+ // -- matmulNT([N, K], [M, K])
+ // -- matmul([N, K], [K, M]) -- [N, M]
+ auto val = fl::matmul(
+ inputs[1].tensor(),
+ gradOutput.tensor(),
+ /* lhsProp = */
+ MatrixProperty::None,
+ /* rhsProp = */
+ MatrixProperty::Transpose
+ );
+ inputs[0].addGrad(Variable(detail::sumAs(val, inputs[0].shape()), false));
+ }
+ if(inputs[1].isCalcGrad()) {
+ // matmul(inputs[0], gradOutput)
+ // -- matmulNT([N, M], [M, K]) -- [N, K]
+ auto val = fl::matmul(inputs[0].tensor(), gradOutput.tensor());
+ inputs[1].addGrad(Variable(detail::sumAs(val, inputs[1].shape()), false));
+ }
+ };
return Variable(result, {lhs, rhs}, gradFunc);
}
-Variable matmulNT(const Variable& lhs, const Variable& rhs) {
+Variable matmulNT(Variable const& lhs, Variable const& rhs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(lhs, rhs);
// lhs:Input[0] -- [M, N]
// rhs:Input[1] -- [K, N]
@@ -950,54 +1004,61 @@ Variable matmulNT(const Variable& lhs, const Variable& rhs) {
auto result = fl::matmul(
lhs.tensor(),
rhs.tensor(),
- /* lhsProp = */ MatrixProperty::None,
- /* rhsProp = */ MatrixProperty::Transpose
+ /* lhsProp = */
+ MatrixProperty::None,
+ /* rhsProp = */
+ MatrixProperty::Transpose
);
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- if(inputs[0].isCalcGrad()) {
- // matmul(gradOutput, inputs[1])
- // -- matmul([M, K], [K, N]) -- [M, N]
- auto val = fl::matmul(gradOutput.tensor(), inputs[1].tensor());
- inputs[0].addGrad(Variable(detail::sumAs(val, inputs[0].shape()), false));
- }
- if(inputs[1].isCalcGrad()) {
- // matmulTN(gradOutput, inputs[0])
- // -- matmulTN([M, K], [M, N])
- // -- matmul([K, M], [M, N]) -- [K, N]
- auto val = fl::matmul(
- gradOutput.tensor(),
- inputs[0].tensor(),
- /* lhsProp = */ MatrixProperty::Transpose
- );
- inputs[1].addGrad(Variable(detail::sumAs(val, inputs[1].shape()), false));
- }
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ if(inputs[0].isCalcGrad()) {
+ // matmul(gradOutput, inputs[1])
+ // -- matmul([M, K], [K, N]) -- [M, N]
+ auto val = fl::matmul(gradOutput.tensor(), inputs[1].tensor());
+ inputs[0].addGrad(Variable(detail::sumAs(val, inputs[0].shape()), false));
+ }
+ if(inputs[1].isCalcGrad()) {
+ // matmulTN(gradOutput, inputs[0])
+ // -- matmulTN([M, K], [M, N])
+ // -- matmul([K, M], [M, N]) -- [K, N]
+ auto val = fl::matmul(
+ gradOutput.tensor(),
+ inputs[0].tensor(),
+ /* lhsProp = */
+ MatrixProperty::Transpose
+ );
+ inputs[1].addGrad(Variable(detail::sumAs(val, inputs[1].shape()), false));
+ }
+ };
return Variable(result, {lhs, rhs}, gradFunc);
}
-Variable abs(const Variable& input) {
+Variable abs(Variable const& input) {
auto result = fl::abs(input.tensor());
- auto gradFunc = [](std::vector& inputs,
- const Variable& gradOutput) {
- // Convert it into -1, 0, 1
- auto sign = fl::sign(inputs[0].tensor());
- inputs[0].addGrad(Variable((sign * gradOutput.tensor()), false));
- };
+ auto gradFunc = [](
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ // Convert it into -1, 0, 1
+ auto sign = fl::sign(inputs[0].tensor());
+ inputs[0].addGrad(Variable((sign * gradOutput.tensor()), false));
+ };
return Variable(result, {input}, gradFunc);
}
-Variable flat(const Variable& input) {
+Variable flat(Variable const& input) {
auto result = input.tensor().flatten();
Shape idims = input.shape();
auto gradFunc =
- [idims](std::vector& inputs, const Variable& gradOutput) {
- inputs[0].addGrad(Variable(reshape(gradOutput.tensor(), idims), false));
- };
+ [idims](std::vector& inputs, Variable const& gradOutput) {
+ inputs[0].addGrad(Variable(reshape(gradOutput.tensor(), idims), false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable moddims(const Variable& input, const Shape& dims) {
+Variable moddims(Variable const& input, Shape const& dims) {
if(input.ndim() == 0)
return input;
Shape inferDims = dims;
@@ -1036,13 +1097,14 @@ Variable moddims(const Variable& input, const Shape& dims) {
Shape inDims = input.shape();
auto gradFunc = [inDims](
std::vector& inputs,
- const Variable& gradOutput) {
- inputs[0].addGrad(Variable(moddims(gradOutput, inDims).tensor(), false));
- };
+ Variable const& gradOutput
+ ) {
+ inputs[0].addGrad(Variable(moddims(gradOutput, inDims).tensor(), false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable softmax(const Variable& input, const int dim) {
+Variable softmax(Variable const& input, int const dim) {
Tensor inputArr = FL_ADJUST_INPUT_TYPE(input.tensor());
auto maxvals = amax(inputArr, {dim}, /* keepDims = */ true);
Shape tiledims(std::vector(input.ndim(), 1));
@@ -1055,17 +1117,18 @@ Variable softmax(const Variable& input, const int dim) {
fl::eval(result);
auto gradFunc = [dim, tiledims, result](
std::vector& inputs,
- const Variable& gradOutput) {
- auto rbyg = gradOutput.tensor() * result;
- auto gradSm = rbyg
- - result
- * fl::tile(fl::sum(rbyg, {dim}, /* keepDims = */ true), tiledims);
- inputs[0].addGrad(Variable(gradSm.astype(inputs[0].type()), false));
- };
+ Variable const& gradOutput
+ ) {
+ auto rbyg = gradOutput.tensor() * result;
+ auto gradSm = rbyg
+ - result
+ * fl::tile(fl::sum(rbyg, {dim}, /* keepDims = */ true), tiledims);
+ inputs[0].addGrad(Variable(gradSm.asType(inputs[0].type()), false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable logSoftmax(const Variable& input, const int dim) {
+Variable logSoftmax(Variable const& input, int const dim) {
Tensor inputArr = FL_ADJUST_INPUT_TYPE(input.tensor());
auto maxvals = amax(inputArr, {dim}, /* keepDims = */ true);
// TODO{fl::Tensor}{rewrite}
@@ -1077,7 +1140,8 @@ Variable logSoftmax(const Variable& input, const int dim) {
fl::sum(
fl::exp(inputArr - fl::tile(maxvals, tiledims)),
{dim},
- /* keepDims = */ true
+ /* keepDims = */
+ true
)
)
+ maxvals,
@@ -1087,28 +1151,29 @@ Variable logSoftmax(const Variable& input, const int dim) {
fl::eval(result);
auto gradFunc = [dim, tiledims, result](
std::vector& inputs,
- const Variable& gradOutput) {
- auto gradLsm = gradOutput.tensor()
- - fl::exp(result)
- * fl::tile(
- fl::sum(gradOutput.tensor(), {dim}, /* keepDims = */ true),
- tiledims
- );
- inputs[0].addGrad(Variable(gradLsm.astype(inputs[0].type()), false));
- };
+ Variable const& gradOutput
+ ) {
+ auto gradLsm = gradOutput.tensor()
+ - fl::exp(result)
+ * fl::tile(
+ fl::sum(gradOutput.tensor(), {dim}, /* keepDims = */ true),
+ tiledims
+ );
+ inputs[0].addGrad(Variable(gradLsm.asType(inputs[0].type()), false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable binaryCrossEntropy(const Variable& inputs, const Variable& targets) {
- auto targetsTyped = targets.astype(inputs.type());
+Variable binaryCrossEntropy(Variable const& inputs, Variable const& targets) {
+ auto targetsTyped = targets.asType(inputs.type());
return negate(
targetsTyped * log(inputs) + (1 - targetsTyped) * log(1 - inputs)
);
}
Variable categoricalCrossEntropy(
- const Variable& in,
- const Variable& targets,
+ Variable const& in,
+ Variable const& targets,
ReduceMode reduction /* =ReduceMode::MEAN */,
int ignoreIndex /* = -1 */
) {
@@ -1129,7 +1194,7 @@ Variable categoricalCrossEntropy(
int C = input.dim(0);
int X = targets.elements();
if(
- fl::any(
+ fl::any_of(
((targets.tensor() < 0) || (targets.tensor() >= C))
&& (targets.tensor() != ignoreIndex)
)
@@ -1143,7 +1208,7 @@ Variable categoricalCrossEntropy(
auto x = fl::reshape(input.tensor(), Shape({C, X}));
auto y = fl::reshape(targets.tensor(), Shape({1, X}));
- auto A = fl::arange(Shape({C, X}));
+ auto A = fl::arrange(Shape({C, X}));
auto B = fl::tile(y, Shape({C}));
auto mask = -(A == B); // [C X]
@@ -1155,12 +1220,15 @@ Variable categoricalCrossEntropy(
Tensor denominator;
if(reduction == ReduceMode::NONE) {
result = fl::reshape(result, targets.shape()); // [X1 X2 X3]
- } else if(reduction == ReduceMode::MEAN) {
- denominator = fl::sum((!ignoreMask).astype(fl::dtype::s32), {0});
+ }
+ else if(reduction == ReduceMode::MEAN) {
+ denominator = fl::sum((!ignoreMask).asType(fl::dtype::s32), {0});
result = fl::sum(result, {0}) / denominator; // [1]
- } else if(reduction == ReduceMode::SUM) {
+ }
+ else if(reduction == ReduceMode::SUM) {
result = fl::sum(result, {0}); // [1]
- } else
+ }
+ else
throw std::invalid_argument(
"unknown reduction method for categorical cross entropy"
);
@@ -1168,28 +1236,29 @@ Variable categoricalCrossEntropy(
auto inputDims = input.shape();
auto gradFunc = [C, X, mask, ignoreMask, denominator, reduction, inputDims](
std::vector& inputs,
- const Variable& gradOutput) {
- Tensor grad = gradOutput.tensor();
- if(reduction == ReduceMode::NONE)
- grad = fl::reshape(grad, {X});
- else if(reduction == ReduceMode::MEAN)
- grad = fl::tile(grad / denominator, {X});
- else if(reduction == ReduceMode::SUM)
- grad = fl::tile(grad, {X});
- // [1 X]
- grad(ignoreMask) = 0.;
- grad = fl::reshape(grad, {1, X});
- grad = fl::tile(grad, {C}) * mask;
- inputs[0].addGrad(Variable(fl::reshape(grad, inputDims), false));
- };
+ Variable const& gradOutput
+ ) {
+ Tensor grad = gradOutput.tensor();
+ if(reduction == ReduceMode::NONE)
+ grad = fl::reshape(grad, {X});
+ else if(reduction == ReduceMode::MEAN)
+ grad = fl::tile(grad / denominator, {X});
+ else if(reduction == ReduceMode::SUM)
+ grad = fl::tile(grad, {X});
+ // [1 X]
+ grad(ignoreMask) = 0.;
+ grad = fl::reshape(grad, {1, X});
+ grad = fl::tile(grad, {C}) * mask;
+ inputs[0].addGrad(Variable(fl::reshape(grad, inputDims), false));
+ };
return Variable(result, {input.withoutData(), targets}, gradFunc);
}
Variable weightedCategoricalCrossEntropy(
- const Variable& input,
- const Variable& targets,
- const Variable& weight,
+ Variable const& input,
+ Variable const& targets,
+ Variable const& weight,
int ignoreIndex /* = -1 */
) {
// input -- [C, X1, X2, X3]
@@ -1213,7 +1282,7 @@ Variable weightedCategoricalCrossEntropy(
int C = input.dim(0);
int X = targets.elements();
if(
- fl::any((targets.tensor() < 0) || (targets.tensor() >= C))
+ fl::any_of((targets.tensor() < 0) || (targets.tensor() >= C))
.scalar()
)
throw std::invalid_argument(
@@ -1224,7 +1293,7 @@ Variable weightedCategoricalCrossEntropy(
auto x = fl::reshape(input.tensor(), {C, X});
auto y = fl::reshape(targets.tensor(), {1, X});
- auto A = fl::arange({C, X});
+ auto A = fl::arrange({C, X});
auto B = fl::tile(y, {C});
auto mask = -(A == B); // [C X]
@@ -1234,29 +1303,30 @@ Variable weightedCategoricalCrossEntropy(
auto result = mask * x;
result = result * weight.tensor();
- auto ignoreMask = (y != ignoreIndex).astype(fl::dtype::s32); // [1, X]
+ auto ignoreMask = (y != ignoreIndex).asType(fl::dtype::s32); // [1, X]
result = ignoreMask * fl::sum(result, {0}, /* keepDims = */ true); // [1, X]
result = fl::sum(result, {1}, /* keepDims = */ true) / denominator.tensor();
auto inputDims = input.shape();
auto gradFunc = [C, X, mask, ignoreMask, denominator, inputDims](
std::vector& inputs,
- const Variable& gradOutput) {
- auto grad = gradOutput.tensor();
- grad = fl::tile(grad / denominator.tensor(), {1, X});
-
- auto weightTensor = inputs[2].tensor();
- grad *= ignoreMask;
- grad = fl::tile(grad, {C}) * mask;
- grad = fl::reshape(grad, inputDims);
- grad = grad * weightTensor;
- inputs[0].addGrad(Variable(fl::reshape(grad, inputDims), false));
- };
+ Variable const& gradOutput
+ ) {
+ auto grad = gradOutput.tensor();
+ grad = fl::tile(grad / denominator.tensor(), {1, X});
+
+ auto weightTensor = inputs[2].tensor();
+ grad *= ignoreMask;
+ grad = fl::tile(grad, {C}) * mask;
+ grad = fl::reshape(grad, inputDims);
+ grad = grad * weightTensor;
+ inputs[0].addGrad(Variable(fl::reshape(grad, inputDims), false));
+ };
return Variable(result, {input.withoutData(), targets, weight}, gradFunc);
}
-Variable reorder(const Variable& input, const Shape& shape) {
+Variable reorder(Variable const& input, Shape const& shape) {
auto result = fl::transpose(input.tensor(), shape);
if(!result.isContiguous())
result = result.asContiguousTensor();
@@ -1268,24 +1338,24 @@ Variable reorder(const Variable& input, const Shape& shape) {
std::sort(dimGrad.begin(), dimGrad.end());
auto gradFunc =
- [dimGrad](std::vector& inputs, const Variable& gradOutput) {
- Shape reordered(std::vector(dimGrad.size()));
- for(unsigned i = 0; i < dimGrad.size(); ++i)
- reordered[i] = dimGrad[i].second;
+ [dimGrad](std::vector& inputs, Variable const& gradOutput) {
+ Shape reordered(std::vector(dimGrad.size()));
+ for(unsigned i = 0; i < dimGrad.size(); ++i)
+ reordered[i] = dimGrad[i].second;
- inputs[0].addGrad(
- Variable(fl::transpose(gradOutput.tensor(), reordered), false)
- );
- };
+ inputs[0].addGrad(
+ Variable(fl::transpose(gradOutput.tensor(), reordered), false)
+ );
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable linear(const Variable& input, const Variable& weight) {
- auto dummyBias = Variable(Tensor().astype(input.type()), false);
+Variable linear(Variable const& input, Variable const& weight) {
+ auto dummyBias = Variable(Tensor().asType(input.type()), false);
return linear(input, weight, dummyBias);
}
-Variable linear(const Variable& in, const Variable& wt, const Variable& bs) {
+Variable linear(Variable const& in, Variable const& wt, Variable const& bs) {
FL_VARIABLE_DTYPES_MATCH_CHECK(in, wt, bs);
auto input = FL_ADJUST_INPUT_TYPE(in);
auto weight = FL_ADJUST_INPUT_TYPE(wt);
@@ -1307,42 +1377,43 @@ Variable linear(const Variable& in, const Variable& wt, const Variable& bs) {
auto gradFunc = [hasBias](
std::vector& inputs,
- const Variable& gradOutput) {
- auto& in = inputs[0];
- auto& wt = inputs[1];
- Tensor wtTensor = wt.tensor();
- Tensor gradOutputTensor = gradOutput.tensor();
-
- auto nframes = in.elements() / in.dim(0);
-
- if(hasBias && inputs[2].isCalcGrad()) {
- auto& bs = inputs[2];
- auto biasGrad = sumAs(gradOutput, bs).tensor();
- bs.addGrad(Variable(biasGrad, false));
- }
- if(in.isCalcGrad()) {
- Shape to2dout({wtTensor.dim(0), nframes});
- auto inGrad =
- moddims(matmulTN(wt, moddims(gradOutput, to2dout)), in.shape())
- .tensor();
- in.addGrad(Variable(inGrad, false));
- }
- if(wt.isCalcGrad()) {
- Shape to2din({wtTensor.dim(1), nframes});
- Shape to2dout({wtTensor.dim(0), nframes});
- auto wtGrad =
- matmulNT(moddims(gradOutput, to2dout), moddims(in, to2din)).tensor();
- wt.addGrad(Variable(wtGrad, false));
- }
- };
+ Variable const& gradOutput
+ ) {
+ auto& in = inputs[0];
+ auto& wt = inputs[1];
+ Tensor wtTensor = wt.tensor();
+ Tensor gradOutputTensor = gradOutput.tensor();
+
+ auto nframes = in.elements() / in.dim(0);
+
+ if(hasBias && inputs[2].isCalcGrad()) {
+ auto& bs = inputs[2];
+ auto biasGrad = sumAs(gradOutput, bs).tensor();
+ bs.addGrad(Variable(biasGrad, false));
+ }
+ if(in.isCalcGrad()) {
+ Shape to2dout({wtTensor.dim(0), nframes});
+ auto inGrad =
+ moddims(matmulTN(wt, moddims(gradOutput, to2dout)), in.shape())
+ .tensor();
+ in.addGrad(Variable(inGrad, false));
+ }
+ if(wt.isCalcGrad()) {
+ Shape to2din({wtTensor.dim(1), nframes});
+ Shape to2dout({wtTensor.dim(0), nframes});
+ auto wtGrad =
+ matmulNT(moddims(gradOutput, to2dout), moddims(in, to2din)).tensor();
+ wt.addGrad(Variable(wtGrad, false));
+ }
+ };
if(hasBias)
return Variable(output, {input, weight, bias}, gradFunc);
return Variable(output, {input, weight}, gradFunc);
}
Variable conv2d(
- const Variable& input,
- const Variable& weights,
+ Variable const& input,
+ Variable const& weights,
int sx,
int sy,
int px,
@@ -1369,9 +1440,9 @@ Variable conv2d(
}
Variable conv2d(
- const Variable& in,
- const Variable& wt,
- const Variable& bs,
+ Variable const& in,
+ Variable const& wt,
+ Variable const& bs,
int sx,
int sy,
int px,
@@ -1407,103 +1478,105 @@ Variable conv2d(
auto gradFunc =
[sx, sy, px, py, dx, dy, hasBias, groups, benchmarks, payload](
- std::vector& inputs, const Variable& gradOutput) {
- // Create benchmarks if needed
- auto& autogradExtension =
- inputs[0].tensor().backend().getExtension();
-
- std::shared_ptr dataBench;
- std::shared_ptr filterBench;
- std::shared_ptr biasBench;
- if(benchmarks && DynamicBenchmark::getBenchmarkMode()) {
- if(!benchmarks->bwdFilterBenchmark) {
- benchmarks->bwdFilterBenchmark =
- autogradExtension.createBenchmarkOptions();
- filterBench = benchmarks->bwdFilterBenchmark;
- }
- if(!benchmarks->bwdDataBenchmark) {
- benchmarks->bwdDataBenchmark =
- autogradExtension.createBenchmarkOptions();
- dataBench = benchmarks->bwdDataBenchmark;
- }
- if(!benchmarks->bwdBiasBenchmark) {
- benchmarks->bwdBiasBenchmark =
- autogradExtension.createBenchmarkOptions();
- biasBench = benchmarks->bwdBiasBenchmark;
- }
+ std::vector& inputs,
+ Variable const& gradOutput
+ ) {
+ // Create benchmarks if needed
+ auto& autogradExtension =
+ inputs[0].tensor().backend().getExtension();
+
+ std::shared_ptr dataBench;
+ std::shared_ptr filterBench;
+ std::shared_ptr biasBench;
+ if(benchmarks && DynamicBenchmark::getBenchmarkMode()) {
+ if(!benchmarks->bwdFilterBenchmark) {
+ benchmarks->bwdFilterBenchmark =
+ autogradExtension.createBenchmarkOptions();
+ filterBench = benchmarks->bwdFilterBenchmark;
}
-
- // Bias gradients
- Tensor bs;
- const bool computeBiasGrad =
- inputs.size() > 2 && inputs[2].isCalcGrad();
- if(hasBias && computeBiasGrad) {
- bs = inputs[2].tensor();
- // auto biasGrad =
- // bs.backend().getExtension().conv2dBackwardBias(
- // gradOutput.tensor(), bs, biasBench, payload);
-
- // inputs[2].addGrad(Variable(biasGrad, false)); // bias
+ if(!benchmarks->bwdDataBenchmark) {
+ benchmarks->bwdDataBenchmark =
+ autogradExtension.createBenchmarkOptions();
+ dataBench = benchmarks->bwdDataBenchmark;
}
-
- auto& in = inputs[0].tensor();
- auto& wt = inputs[1].tensor();
-
- // Data (input) gradients
- if(inputs[0].isCalcGrad()) {
- auto dataGrad =
- in.backend().getExtension().conv2dBackwardData(
- gradOutput.tensor(),
- in,
- wt,
- sx,
- sy,
- px,
- py,
- dx,
- dy,
- groups,
- dataBench,
- payload
- );
-
- inputs[0].addGrad(Variable(dataGrad, false)); // input/data
+ if(!benchmarks->bwdBiasBenchmark) {
+ benchmarks->bwdBiasBenchmark =
+ autogradExtension.createBenchmarkOptions();
+ biasBench = benchmarks->bwdBiasBenchmark;
}
+ }
+
+ // Bias gradients
+ Tensor bs;
+ bool const computeBiasGrad =
+ inputs.size() > 2 && inputs[2].isCalcGrad();
+ if(hasBias && computeBiasGrad) {
+ bs = inputs[2].tensor();
+ // auto biasGrad =
+ // bs.backend().getExtension().conv2dBackwardBias(
+ // gradOutput.tensor(), bs, biasBench, payload);
+
+ // inputs[2].addGrad(Variable(biasGrad, false)); // bias
+ }
+
+ auto& in = inputs[0].tensor();
+ auto& wt = inputs[1].tensor();
+
+ // Data (input) gradients
+ if(inputs[0].isCalcGrad()) {
+ auto dataGrad =
+ in.backend().getExtension().conv2dBackwardData(
+ gradOutput.tensor(),
+ in,
+ wt,
+ sx,
+ sy,
+ px,
+ py,
+ dx,
+ dy,
+ groups,
+ dataBench,
+ payload
+ );
- // Filter (weight) and bias gradients
- if(inputs[1].isCalcGrad() || computeBiasGrad) {
- auto [filterGrad, biasGrad] = wt.backend()
- .getExtension()
- .conv2dBackwardFilterBias(
- gradOutput.tensor(),
- in,
- wt,
- bs,
- sx,
- sy,
- px,
- py,
- dx,
- dy,
- groups,
- filterBench,
- biasBench,
- payload
- );
- if(inputs[1].isCalcGrad()) {
- inputs[1].addGrad(Variable(filterGrad, false)); // filter/weight
- }
- if(computeBiasGrad)
- inputs[2].addGrad(Variable(biasGrad, false));
+ inputs[0].addGrad(Variable(dataGrad, false)); // input/data
+ }
+
+ // Filter (weight) and bias gradients
+ if(inputs[1].isCalcGrad() || computeBiasGrad) {
+ auto [filterGrad, biasGrad] = wt.backend()
+ .getExtension()
+ .conv2dBackwardFilterBias(
+ gradOutput.tensor(),
+ in,
+ wt,
+ bs,
+ sx,
+ sy,
+ px,
+ py,
+ dx,
+ dy,
+ groups,
+ filterBench,
+ biasBench,
+ payload
+ );
+ if(inputs[1].isCalcGrad()) {
+ inputs[1].addGrad(Variable(filterGrad, false)); // filter/weight
}
- };
+ if(computeBiasGrad)
+ inputs[2].addGrad(Variable(biasGrad, false));
+ }
+ };
if(hasBias)
return Variable(output, {input, weights, bias}, gradFunc);
return Variable(output, {input, weights}, gradFunc);
}
Variable pool2d(
- const Variable& input,
+ Variable const& input,
int wx,
int wy,
int sx,
@@ -1518,40 +1591,41 @@ Variable pool2d(
auto gradFunc = [wx, wy, sx, sy, px, py, mode, output, payload](
std::vector<Variable>& inputs,
- const Variable& gradOutput) {
- auto& in = inputs[0];
- if(!in.isCalcGrad())
- return;
+ Variable const& gradOutput
+ ) {
+ auto& in = inputs[0];
+ if(!in.isCalcGrad())
+ return;
- in.addGrad(
- Variable(
- in.tensor().backend().getExtension().pool2dBackward(
- gradOutput.tensor(),
- in.tensor(),
- output,
- wx,
- wy,
- sx,
- sy,
- px,
- py,
- mode,
- payload
- ),
- false
- )
- );
- };
+ in.addGrad(
+ Variable(
+ in.tensor().backend().getExtension().pool2dBackward(
+ gradOutput.tensor(),
+ in.tensor(),
+ output,
+ wx,
+ wy,
+ sx,
+ sy,
+ px,
+ py,
+ mode,
+ payload
+ ),
+ false
+ )
+ );
+ };
return Variable(output, {input}, gradFunc);
}
Variable batchnorm(
- const Variable& _input,
- const Variable& weight,
- const Variable& bias,
+ Variable const& _input,
+ Variable const& weight,
+ Variable const& bias,
Variable& runningMean,
Variable& runningVar,
- const std::vector& axes,
+ std::vector const& axes,
bool train,
double momentum,
double epsilon
@@ -1581,41 +1655,41 @@ Variable batchnorm(
train,
axes,
epsilon,
- payload](std::vector<Variable>& inputs, const Variable& _gradOutput) {
- auto& in = inputs[0];
- auto& wt = inputs[1];
- auto& bs = inputs[2];
-
- auto gradOutput = detail::adjustInputType(_gradOutput, "batchnorm");
-
- if(!in.isCalcGrad() && !wt.isCalcGrad() && !bs.isCalcGrad())
- return;
-
- auto [gradIn, gradWt, gradBs] =
- in.tensor()
- .backend()
- .getExtension()
- .batchnormBackward(
- gradOutput.tensor(),
- saveMean,
- saveVar,
- detail::adjustInputType(in.tensor(), "batchnorm"),
- wt.tensor(),
- axes,
- train,
- epsilon,
- payload
- );
-
- in.addGrad(Variable(gradIn.astype(in.type()), false));
- wt.addGrad(Variable(gradWt.astype(wt.type()), false));
- if(!bs.isEmpty())
- bs.addGrad(Variable(gradBs.astype(bs.type()), false));
- };
+ payload](std::vector<Variable>& inputs, Variable const& _gradOutput) {
+ auto& in = inputs[0];
+ auto& wt = inputs[1];
+ auto& bs = inputs[2];
+
+ auto gradOutput = detail::adjustInputType(_gradOutput, "batchnorm");
+
+ if(!in.isCalcGrad() && !wt.isCalcGrad() && !bs.isCalcGrad())
+ return;
+
+ auto [gradIn, gradWt, gradBs] =
+ in.tensor()
+ .backend()
+ .getExtension()
+ .batchnormBackward(
+ gradOutput.tensor(),
+ saveMean,
+ saveVar,
+ detail::adjustInputType(in.tensor(), "batchnorm"),
+ wt.tensor(),
+ axes,
+ train,
+ epsilon,
+ payload
+ );
+
+ in.addGrad(Variable(gradIn.asType(in.type()), false));
+ wt.addGrad(Variable(gradWt.asType(wt.type()), false));
+ if(!bs.isEmpty())
+ bs.addGrad(Variable(gradBs.asType(bs.type()), false));
+ };
return Variable(output, {input, weight, bias}, gradFunc);
}
-Variable gatedlinearunit(const Variable& input, const int dim) {
+Variable gatedlinearunit(Variable const& input, int const dim) {
if(dim >= input.ndim())
throw std::invalid_argument(
"gatedlinearunit - passed dim is great than the "
@@ -1643,21 +1717,22 @@ Variable gatedlinearunit(const Variable& input, const int dim) {
auto gradFunc = [fhalf, shalf, fhalfout, shalfout, inDims, inType](
std::vector<Variable>& inputs,
- const Variable& gradOutput) {
- auto gradGlu = Tensor(inDims, inType);
- gradGlu(fhalf) = shalfout * gradOutput.tensor();
- gradGlu(shalf) =
- shalfout * (1.0 - shalfout) * fhalfout * gradOutput.tensor();
- inputs[0].addGrad(Variable(gradGlu, false));
- };
+ Variable const& gradOutput
+ ) {
+ auto gradGlu = Tensor(inDims, inType);
+ gradGlu(fhalf) = shalfout * gradOutput.tensor();
+ gradGlu(shalf) =
+ shalfout * (1.0 - shalfout) * fhalfout * gradOutput.tensor();
+ inputs[0].addGrad(Variable(gradGlu, false));
+ };
return Variable(fhalfout * shalfout, {input.withoutData()}, gradFunc);
}
std::tuple<Variable, Variable, Variable> rnn(
- const Variable& input,
- const Variable& hiddenState,
- const Variable& cellState,
- const Variable& weights,
+ Variable const& input,
+ Variable const& hiddenState,
+ Variable const& cellState,
+ Variable const& weights,
int hiddenSize,
int numLayers,
RnnMode mode,
@@ -1691,63 +1766,65 @@ std::tuple rnn(
dropProb,
gradData,
payload](
- std::vector<Variable>& inputs,
- const Variable& /* gradOutput */) {
- auto& input = inputs[0];
- auto& hiddenState = inputs[1];
- auto& cellState = inputs[2];
- auto& weights = inputs[3];
-
- if(
- !(input.isCalcGrad() || hiddenState.isCalcGrad()
+ std::vector<Variable>& inputs,
+ Variable const& /* gradOutput */
+
+ ) {
+ auto& input = inputs[0];
+ auto& hiddenState = inputs[1];
+ auto& cellState = inputs[2];
+ auto& weights = inputs[3];
+
+ if(
+ !(input.isCalcGrad() || hiddenState.isCalcGrad()
|| cellState.isCalcGrad() || weights.isCalcGrad())
- )
- return;
-
- auto [dy, dhy, dcy, dweights] =
- input.tensor().backend().getExtension().rnnBackward(
- input.tensor(),
- hiddenState.tensor(),
- cellState.tensor(),
- weights.tensor(),
- gradData,
- output,
- numLayers,
- hiddenSize,
- mode,
- bidirectional,
- dropProb,
- payload
- );
+ )
+ return;
+
+ auto [dy, dhy, dcy, dweights] =
+ input.tensor().backend().getExtension().rnnBackward(
+ input.tensor(),
+ hiddenState.tensor(),
+ cellState.tensor(),
+ weights.tensor(),
+ gradData,
+ output,
+ numLayers,
+ hiddenSize,
+ mode,
+ bidirectional,
+ dropProb,
+ payload
+ );
- input.addGrad(Variable(dy.astype(input.type()), false));
- hiddenState.addGrad(Variable(dhy.astype(hiddenState.type()), false));
- cellState.addGrad(Variable(dcy.astype(cellState.type()), false));
- weights.addGrad(Variable(dweights.astype(weights.type()), false));
- };
+ input.addGrad(Variable(dy.asType(input.type()), false));
+ hiddenState.addGrad(Variable(dhy.asType(hiddenState.type()), false));
+ cellState.addGrad(Variable(dcy.asType(cellState.type()), false));
+ weights.addGrad(Variable(dweights.asType(weights.type()), false));
+ };
Variable dummy(Tensor(), {input, hiddenState, cellState, weights}, gradFunc);
auto dyGradFunc =
- [gradData](std::vector<Variable>& inputs, const Variable& gradOutput) {
- if(!inputs[0].isGradAvailable())
- inputs[0].addGrad(Variable(Tensor(), false));
- gradData->dy = gradOutput.tensor().asContiguousTensor();
- };
+ [gradData](std::vector<Variable>& inputs, Variable const& gradOutput) {
+ if(!inputs[0].isGradAvailable())
+ inputs[0].addGrad(Variable(Tensor(), false));
+ gradData->dy = gradOutput.tensor().asContiguousTensor();
+ };
auto dhyGradFunc =
- [gradData](std::vector<Variable>& inputs, const Variable& gradOutput) {
- if(!inputs[0].isGradAvailable())
- inputs[0].addGrad(Variable(Tensor(), false));
- gradData->dhy = gradOutput.tensor().asContiguousTensor();
- };
+ [gradData](std::vector<Variable>& inputs, Variable const& gradOutput) {
+ if(!inputs[0].isGradAvailable())
+ inputs[0].addGrad(Variable(Tensor(), false));
+ gradData->dhy = gradOutput.tensor().asContiguousTensor();
+ };
auto dcyGradFunc =
- [gradData](std::vector<Variable>& inputs, const Variable& gradOutput) {
- if(!inputs[0].isGradAvailable())
- inputs[0].addGrad(Variable(Tensor(), false));
- gradData->dcy = gradOutput.tensor().asContiguousTensor();
- };
+ [gradData](std::vector<Variable>& inputs, Variable const& gradOutput) {
+ if(!inputs[0].isGradAvailable())
+ inputs[0].addGrad(Variable(Tensor(), false));
+ gradData->dcy = gradOutput.tensor().asContiguousTensor();
+ };
Variable yv(output, {dummy}, dyGradFunc); // output
Variable hyv(hiddenOut, {dummy}, dhyGradFunc); // hidden state output
@@ -1755,63 +1832,67 @@ std::tuple rnn(
return std::make_tuple(yv, hyv, cyv);
}
-Variable embedding(const Variable& input, const Variable& embeddings) {
+Variable embedding(Variable const& input, Variable const& embeddings) {
// TODO{fl::Tensor}{4-dims} - relax this
if(input.ndim() >= 4)
- throw std::invalid_argument("embedding input must have 3 or fewer dims");
+ throw std::invalid_argument{"embedding input must have 3 or fewer dims"};
- auto idxs = input.tensor().flatten();
+ auto const idxs = input.tensor().flatten();
auto inDims = input.shape();
std::vector<Dim> rDims(input.ndim() + 1);
rDims[0] = embeddings.dim(0);
- for(unsigned i = 1; i < input.ndim() + 1; i++)
+ for(Dim i = 1; i < input.ndim() + 1; i++)
rDims[i] = inDims[i - 1];
- Shape resultDims(rDims);
- Tensor result = fl::reshape(embeddings.tensor()(fl::span, idxs), resultDims);
-
- auto gradFunc = [](std::vector<Variable>& inputs,
- const Variable& gradOutput) {
- auto& w = inputs[1];
- if(!w.isCalcGrad())
- return;
-
- auto ip = inputs[0].tensor().flatten();
- unsigned size = ip.elements();
- auto deltas = fl::reshape(gradOutput.tensor(), {w.dim(0), size});
-
- // Sparse Tensor
- auto sp = Tensor(
- ip.elements(),
- w.dim(1),
- fl::full({size}, 1, deltas.type()),
- fl::arange({size + 1}, 0, fl::dtype::s32),
- ip.astype(fl::dtype::s32),
- fl::StorageType::CSR
- );
-
- auto grad = transpose(
- fl::matmul(
- sp,
- transpose(deltas), /* lhsProp = */
- MatrixProperty::Transpose
- )
- );
- w.addGrad(Variable(grad, false));
- };
-
- return Variable(result, {input, embeddings}, gradFunc);
+
+ Shape const resultDims{rDims};
+ auto const result = fl::reshape(embeddings.tensor()(fl::span, idxs), resultDims);
+
+ auto grad_func = [](
+ std::vector<Variable>& inputs,
+ Variable const& gradOutput
+ ) {
+ auto& w = inputs[1];
+ if(!w.isCalcGrad())
+ return;
+
+ auto const ip = inputs[0].tensor().flatten();
+ auto size = static_cast(ip.elements());
+ auto const deltas = fl::reshape(gradOutput.tensor(), {w.dim(0), size});
+
+ // Sparse Tensor
+ auto const sp = Tensor{
+ size,
+ w.dim(1),
+ fl::full({size}, 1.0, deltas.type()),
+ fl::arange({size + 1}, 0, fl::dtype::s32),
+ ip.asType(fl::dtype::s32),
+ fl::StorageType::CSR
+ };
+
+ auto const grad = transpose(
+ fl::matmul(
+ sp,
+ transpose(deltas),
+ /* lhsProp = */
+ MatrixProperty::Transpose
+ )
+ );
+ w.addGrad(Variable{grad, false});
+ };
+
+ return Variable{result, {input, embeddings}, grad_func};
}
Variable padding(
- const Variable& input,
+ Variable const& input,
std::vector<std::pair<int, int>> pad,
double val
) {
if(pad.size() > input.ndim())
- throw std::invalid_argument(
+ throw std::invalid_argument{
"padding: number of padding dimensions exceeds number "
"of input dimensions"
- );
+ };
Shape opDims = input.shape();
std::vector<fl::Index> inSeq(input.ndim(), fl::span);
@@ -1823,33 +1904,34 @@ Variable padding(
result(inSeq) = input.tensor();
auto gradFunc =
- [inSeq](std::vector<Variable>& inputs, const Variable& gradOutput) {
- inputs[0].addGrad(Variable(gradOutput.tensor()(inSeq), false));
- };
+ [inSeq](std::vector<Variable>& inputs, Variable const& gradOutput) {
+ inputs[0].addGrad(Variable(gradOutput.tensor()(inSeq), false));
+ };
return Variable(result, {input.withoutData()}, gradFunc);
}
-Variable dropout(const Variable& input, double p) {
+Variable dropout(Variable const& input, double p) {
if(p > 0.0) {
auto mask = Variable(
- (fl::rand(input.shape(), input.type()) > p).astype(input.type()),
+ (fl::rand(input.shape(), input.type()) > p).asType(input.type()),
false
);
return 1.0 / (1.0 - p) * mask * input;
- } else
+ }
+ else
return input;
}
-Variable relu(const Variable& input) { return max(input, 0.0); }
+Variable relu(Variable const& input) { return max(input, 0.0); }
-Variable gelu(const Variable& in) {
+Variable gelu(Variable const& in) {
auto input = FL_ADJUST_INPUT_TYPE(in);
return 0.5 * input
- * (1.0
- + fl::tanh(0.7978845608 * (input + 0.044715 * input * input * input)));
+ * (1.0
+ + fl::tanh(0.7978845608 * (input + 0.044715 * input * input * input)));
}
-fl::Variable relativePositionEmbeddingRotate(const fl::Variable& input) {
+fl::Variable relativePositionEmbeddingRotate(fl::Variable const& input) {
if(input.ndim() != 3)
throw std::invalid_argument(
"relativePositionEmbeddingRotate - "
@@ -1870,31 +1952,32 @@ fl::Variable relativePositionEmbeddingRotate(const fl::Variable& input) {
data = fl::reshape(data, {d0 + d1 - 1, d1, d2});
auto gradFunc = [d0, d1, d2](
std::vector<Variable>& inputs,
- const fl::Variable& gradOutput) {
- auto gradData = gradOutput.tensor();
- gradData = fl::reshape(gradData, {(d0 + d1 - 1) * d1, 1, d2});
- gradData = fl::concatenate(
- 0,
- gradData,
- fl::full({d1, 1, d2}, 0.0, gradData.type())
- );
- gradData = reshape(gradData, {d0 + d1, d1, d2});
- gradData = Variable(gradData, false)(fl::range(0, d0)).tensor();
- inputs[0].addGrad(fl::Variable(gradData, false));
- };
+ fl::Variable const& gradOutput
+ ) {
+ auto gradData = gradOutput.tensor();
+ gradData = fl::reshape(gradData, {(d0 + d1 - 1) * d1, 1, d2});
+ gradData = fl::concatenate(
+ 0,
+ gradData,
+ fl::full({d1, 1, d2}, 0.0, gradData.type())
+ );
+ gradData = reshape(gradData, {d0 + d1, d1, d2});
+ gradData = Variable(gradData, false)(fl::range(0, d0)).tensor();
+ inputs[0].addGrad(fl::Variable(gradData, false));
+ };
return fl::Variable(data, {input}, gradFunc);
}
fl::Variable multiheadAttention(
- const fl::Variable& query,
- const fl::Variable& key,
- const fl::Variable& value,
- const fl::Variable& posEmb,
- const fl::Variable& mask,
- const fl::Variable& padMask,
- const int32_t nHeads,
- const double pDropout,
- const int32_t offset /* = 0 */
+ fl::Variable const& query,
+ fl::Variable const& key,
+ fl::Variable const& value,
+ fl::Variable const& posEmb,
+ fl::Variable const& mask,
+ fl::Variable const& padMask,
+ int32_t const nHeads,
+ double const pDropout,
+ int32_t const offset /* = 0 */
) {
if(query.ndim() != 3)
throw std::invalid_argument(
@@ -1925,12 +2008,12 @@ fl::Variable multiheadAttention(
if(!posEmb.isEmpty()) {
int n = posEmb.dim(0) / 2 - offset;
auto pscores =
- relativePositionEmbeddingRotate(matmulNT(posEmb.astype(q.type()), q));
+ relativePositionEmbeddingRotate(matmulNT(posEmb.asType(q.type()), q));
scores =
scores + transpose(pscores(fl::range(n, n + k.dim(0))), {1, 0, 2});
}
if(!mask.isEmpty())
- scores = scores + tileAs(mask.astype(scores.type()), scores);
+ scores = scores + tileAs(mask.asType(scores.type()), scores);
if(!padMask.isEmpty()) {
if(padMask.dim(0) != query.dim(0))
throw std::invalid_argument(
@@ -1941,13 +2024,13 @@ fl::Variable multiheadAttention(
tileAs(padMaskTile, {padMask.dim(0), padMask.dim(0), nHeads, bsz});
scores = scores
+ moddims(
- padMaskTile.astype(scores.type()),
+ padMaskTile.asType(scores.type()),
{padMask.dim(0), padMask.dim(0), nHeads * bsz}
);
}
auto attn = dropout(softmax(scores, 1), pDropout);
- auto result = matmul(attn.astype(v.type()), v);
+ auto result = matmul(attn.asType(v.type()), v);
result = moddims(result, {-1, headDim * nHeads, bsz});
return result;
}
diff --git a/flashlight/fl/autograd/Functions.h b/flashlight/fl/autograd/Functions.h
index b2d23a0..e6423ab 100644
--- a/flashlight/fl/autograd/Functions.h
+++ b/flashlight/fl/autograd/Functions.h
@@ -71,11 +71,11 @@ namespace detail {
&& optimLevel != OptimLevel::DEFAULT
)
// Not in the excluded list - cast to f16
- res = in.astype(fl::dtype::f16);
+ res = in.asType(fl::dtype::f16);
else {
// Upcast to f32 only if we have an f16 input - otherwise, leave as is
if(in.type() == fl::dtype::f16)
- res = in.astype(fl::dtype::f32);
+ res = in.asType(fl::dtype::f32);
else
res = in;
}
@@ -449,7 +449,7 @@ FL_API Variable concatenate(const std::vector& concatInputs, int dim);
* divisible, last chunk of smaller splitSize will be included.
* @param dim dimension along which to split the Variable
*/
-FL_API std::vector<Variable> split(const Variable& input, long splitSize, int dim);
+FL_API std::vector<Variable> split(const Variable& input, int64_t splitSize, int dim);
/**
* Splits a Variable into smaller chunks.
@@ -458,7 +458,7 @@ FL_API std::vector split(const Variable& input, long splitSize, int di
* @param splitSizes vector of integers specifying the sizes for each split
* @param dim dimension along which to split the Variable
*/
-FL_API std::vector<Variable> split(const Variable& input, const std::vector<long>& splitSizes, int dim);
+FL_API std::vector<Variable> split(const Variable& input, std::vector<int64_t> const& splitSizes, int dim);
/**
* Repeats the tensor `input` along specific dimensions. The number of
diff --git a/flashlight/fl/autograd/Variable.cpp b/flashlight/fl/autograd/Variable.cpp
index bd1fb6a..08e1914 100644
--- a/flashlight/fl/autograd/Variable.cpp
+++ b/flashlight/fl/autograd/Variable.cpp
@@ -95,13 +95,13 @@ Variable Variable::copy() const {
return Variable(sharedData_->data, sharedGrad_->calcGrad);
}
-Variable Variable::astype(fl::dtype newType) const {
- auto output = tensor().astype(newType);
+Variable Variable::asType(fl::dtype newType) const {
+ auto output = tensor().asType(newType);
auto gradFunc = [](std::vector<Variable>& inputs,
const Variable& gradOutput) {
auto& input = inputs[0];
// Cast the grad output to match the type of the input's grad
- input.addGrad(Variable(gradOutput.tensor().astype(input.type()), false));
+ input.addGrad(Variable(gradOutput.tensor().asType(input.type()), false));
};
return Variable(output, {this->withoutData()}, gradFunc);
}
diff --git a/flashlight/fl/autograd/Variable.h b/flashlight/fl/autograd/Variable.h
index 60fb040..2ce498d 100644
--- a/flashlight/fl/autograd/Variable.h
+++ b/flashlight/fl/autograd/Variable.h
@@ -128,7 +128,12 @@ class FL_API Variable {
*
* @return returns the casted variable.
*/
- Variable astype(fl::dtype type) const;
+ Variable asType(fl::dtype type) const;
+
+ /**
+ * @deprecated use @ref Variable::asType(fl::dtype) const instead
+ */
+ Variable astype(fl::dtype type) const { return asType(type); }
/**
* @return a reference to the underlying gradient Variable.
@@ -207,25 +212,19 @@ class FL_API Variable {
* Must eventually be freed manually via `free` or a related call.
*/
template <typename T>
- T* host() const {
- return tensor().host<T>();
- }
+ T* host() const { return tensor().host<T>(); }
/**
* Copies the array to the existing host pointer `ptr`
*/
template <typename T>
- void host(T* ptr) const {
- tensor().host(ptr);
- }
+ void host(T* ptr) const { tensor().host(ptr); }
/**
* Get the first element of the array as a scalar
*/
template <typename T>
- T scalar() const {
- return tensor().scalar<T>();
- }
+ T scalar() const { return tensor().scalar<T>(); }
/**
* Remove the gradient stored by the Variable
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp
index 9f6b315..bb4e7ad 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/BatchNorm.cpp
@@ -1,8 +1,8 @@
/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * SPDX-License-Identifier: MIT
*
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
+ * Original code: Copyright (c) Meta Platforms, Inc. (see FLASHLIGHT_LICENSE)
+ * Modifications: Copyright (c) 2026 Lukas Thomann (see LICENSE)
*/
#include "flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.h"
@@ -48,15 +48,15 @@ namespace {
if(minAxis == 0) {
modeOut = CUDNN_BATCHNORM_PER_ACTIVATION;
- inDescDimsOut = Shape(
+ inDescDimsOut = Shape{
{
1,
1,
nfeatures,
- static_cast(input.elements() / nfeatures)
+ static_cast(input.elements() / nfeatures)
}
- );
- wtDescDimsOut = Shape({1, 1, nfeatures});
+ };
+ wtDescDimsOut = Shape{1, 1, nfeatures};
} else {
modeOut = CUDNN_BATCHNORM_SPATIAL;
#if CUDNN_VERSION >= 7003
@@ -67,15 +67,15 @@ namespace {
int batchsz = 1;
for(int i = maxAxis + 1; i < input.ndim(); ++i)
batchsz *= input.dim(i);
- inDescDimsOut = Shape(
+ inDescDimsOut = Shape{
{
1,
- static_cast(input.elements() / (nfeatures * batchsz)),
+ static_cast(input.elements() / (nfeatures * batchsz)),
nfeatures,
batchsz,
}
- );
- wtDescDimsOut = Shape({1, 1, nfeatures});
+ };
+ wtDescDimsOut = Shape{1, 1, nfeatures};
}
}
@@ -101,7 +101,7 @@ Tensor CudnnAutogradExtension::batchnorm(
);
FL_TENSOR_DTYPES_MATCH_CHECK(weight, bias, runningMean, runningVar);
- auto output = Tensor(input.shape(), input.type());
+ auto output = Tensor{input.shape(), input.type()};
cudnnBatchNormMode_t mode;
Shape inDescDims, wtDescDims;
@@ -115,15 +115,15 @@ Tensor CudnnAutogradExtension::batchnorm(
// Weight, bias, and running mean/var arrays can't be fp16 (must be fp32)
Tensor weightArray = weight.isEmpty()
? fl::full(wtDescDims, 1.0, fl::dtype::f32)
- : weight.astype(fl::dtype::f32);
+ : weight.asType(fl::dtype::f32);
Tensor biasArray = bias.isEmpty() ? fl::full(wtDescDims, 0.0, fl::dtype::f32)
- : bias.astype(fl::dtype::f32);
+ : bias.asType(fl::dtype::f32);
fl::dtype scalarsType =
input.type() == fl::dtype::f16 ? fl::dtype::f32 : input.type();
- auto inDesc = TensorDescriptor(input.type(), inDescDims);
- auto wtDesc = TensorDescriptor(weightArray.type(), wtDescDims);
+ auto inDesc = TensorDescriptor{input.type(), inDescDims};
+ auto wtDesc = TensorDescriptor{weightArray.type(), wtDescDims};
{
DevicePtr inRaw(input);
@@ -140,8 +140,8 @@ Tensor CudnnAutogradExtension::batchnorm(
);
if(train) {
- saveMean = Tensor({wtDescDims[2]}, scalarsType);
- saveVar = Tensor({wtDescDims[2]}, scalarsType);
+ saveMean = Tensor{{wtDescDims[2]}, scalarsType};
+ saveVar = Tensor{{wtDescDims[2]}, scalarsType};
DevicePtr saveMeanRaw(saveMean);
DevicePtr saveVarRaw(saveVar);
@@ -153,11 +153,11 @@ Tensor CudnnAutogradExtension::batchnorm(
mode,
kOne(scalarsType),
kZero(scalarsType),
- inDesc.descriptor,
+ inDesc.get(),
inRaw.get(),
- inDesc.descriptor,
+ inDesc.get(),
outRaw.get(),
- wtDesc.descriptor,
+ wtDesc.get(),
wtRaw.get(),
bsRaw.get(),
momentum,
@@ -175,11 +175,11 @@ Tensor CudnnAutogradExtension::batchnorm(
mode,
kOne(scalarsType),
kZero(scalarsType),
- inDesc.descriptor,
+ inDesc.get(),
inRaw.get(),
- inDesc.descriptor,
+ inDesc.get(),
outRaw.get(),
- wtDesc.descriptor,
+ wtDesc.get(),
wtRaw.get(),
bsRaw.get(),
runMeanRaw.get(),
@@ -223,13 +223,13 @@ std::tuple CudnnAutogradExtension::batchnormBackward(
const void* one1 = kOne(scalarsType);
const void* zero0 = kZero(scalarsType);
- auto iDesc = TensorDescriptor(input.type(), inDescDims);
- auto wDesc = TensorDescriptor(wt.type(), wtDescDims);
+ auto iDesc = TensorDescriptor{input.type(), inDescDims};
+ auto wDesc = TensorDescriptor{wt.type(), wtDescDims};
// CuDNN doesn't support calculating only the gradients
// required for batchnorm
- auto gradIn = Tensor(input.shape(), input.type());
- auto gradWt = Tensor(wt.shape(), wt.type());
- auto gradBs = Tensor(wt.shape(), wt.type());
+ auto gradIn = Tensor{input.shape(), input.type()};
+ auto gradWt = Tensor{wt.shape(), wt.type()};
+ auto gradBs = Tensor{wt.shape(), wt.type()};
{
DevicePtr iRaw(input);
DevicePtr wRaw(wt);
@@ -257,13 +257,13 @@ std::tuple CudnnAutogradExtension::batchnormBackward(
zero0,
one1,
zero0,
- iDesc.descriptor,
+ iDesc.get(),
iRaw.get(),
- iDesc.descriptor,
+ iDesc.get(),
gradOpRaw.get(),
- iDesc.descriptor,
+ iDesc.get(),
gradInRaw.get(),
- wDesc.descriptor,
+ wDesc.get(),
wRaw.get(),
gradWtRaw.get(),
gradBsRaw.get(),
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CMakeLists.txt b/flashlight/fl/autograd/tensor/backend/cudnn/CMakeLists.txt
index 49660c9..0bb7eba 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/CMakeLists.txt
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/CMakeLists.txt
@@ -8,6 +8,8 @@ target_sources(
${CMAKE_CURRENT_LIST_DIR}/Conv2D.cpp
${CMAKE_CURRENT_LIST_DIR}/CudnnUtils.h
${CMAKE_CURRENT_LIST_DIR}/CudnnUtils.cpp
+ ${CMAKE_CURRENT_LIST_DIR}/CudnnRnnUtils.h
+ ${CMAKE_CURRENT_LIST_DIR}/CudnnRnnUtils.cpp
${CMAKE_CURRENT_LIST_DIR}/Pool2D.cpp
${CMAKE_CURRENT_LIST_DIR}/RNN.cpp
)
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp
index bb89e61..e6560e5 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/Conv2D.cpp
@@ -1,8 +1,8 @@
/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * SPDX-License-Identifier: MIT
*
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
+ * Original code: Copyright (c) Meta Platforms, Inc. (see FLASHLIGHT_LICENSE)
+ * Modifications: Copyright (c) 2026 Lukas Thomann (see LICENSE)
*/
#include "flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.h"
@@ -270,7 +270,7 @@ namespace {
) {
CUDNN_CHECK_ERR(
cudnnSetConvolutionMathType(
- cDesc.descriptor,
+ cDesc.get(),
kKernelModesToCudnnMathType.at(kernelOptions->currentOption())
)
);
@@ -280,13 +280,13 @@ namespace {
if(input.type() == fl::dtype::f16)
CUDNN_CHECK_ERR(
cudnnSetConvolutionMathType(
- cDesc.descriptor,
+ cDesc.get(),
CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION
)
);
else
CUDNN_CHECK_ERR(
- cudnnSetConvolutionMathType(cDesc.descriptor, CUDNN_DEFAULT_MATH)
+ cudnnSetConvolutionMathType(cDesc.get(), CUDNN_DEFAULT_MATH)
);
}
@@ -314,42 +314,42 @@ Tensor CudnnAutogradExtension::conv2d(
auto hasBias = bias.elements() > 0;
- auto inDesc = TensorDescriptor(input);
- auto wtDesc = FilterDescriptor(weights);
- auto convDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups);
+ auto inDesc = TensorDescriptor{input};
+ auto wtDesc = FilterDescriptor{weights};
+ auto convDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups};
if(input.type() == fl::dtype::f16)
CUDNN_CHECK_ERR(
cudnnSetConvolutionMathType(
- convDesc.descriptor,
+ convDesc.get(),
CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION
)
);
else
CUDNN_CHECK_ERR(
- cudnnSetConvolutionMathType(convDesc.descriptor, CUDNN_DEFAULT_MATH)
+ cudnnSetConvolutionMathType(convDesc.get(), CUDNN_DEFAULT_MATH)
);
std::array<int, 4> odims;
CUDNN_CHECK_ERR(
cudnnGetConvolutionNdForwardOutputDim(
- convDesc.descriptor,
- inDesc.descriptor,
- wtDesc.descriptor,
+ convDesc.get(),
+ inDesc.get(),
+ wtDesc.get(),
4,
odims.data()
)
);
- auto output = Tensor({odims[3], odims[2], odims[1], odims[0]}, input.type());
- auto outDesc = TensorDescriptor(output);
+ auto output = Tensor{{odims[3], odims[2], odims[1], odims[0]}, input.type()};
+ auto outDesc = TensorDescriptor{output};
auto handle = getCudnnHandle();
const auto& cudnnStream = getCudnnStream();
auto fwdAlgoBestPerf = getFwdAlgo(
- inDesc.descriptor,
- wtDesc.descriptor,
- convDesc.descriptor,
- outDesc.descriptor,
+ inDesc.get(),
+ wtDesc.get(),
+ convDesc.get(),
+ outDesc.get(),
input.type()
);
@@ -357,22 +357,22 @@ Tensor CudnnAutogradExtension::conv2d(
try {
wspace =
- Tensor({static_cast(fwdAlgoBestPerf.memory)}, fl::dtype::b8);
+ Tensor{{static_cast(fwdAlgoBestPerf.memory)}, fl::dtype::b8};
} catch(const std::exception&) {
fwdAlgoBestPerf.algo = kFwdDefaultAlgo;
CUDNN_CHECK_ERR(
cudnnGetConvolutionForwardWorkspaceSize(
handle,
- inDesc.descriptor,
- wtDesc.descriptor,
- convDesc.descriptor,
- outDesc.descriptor,
+ inDesc.get(),
+ wtDesc.get(),
+ convDesc.get(),
+ outDesc.get(),
fwdAlgoBestPerf.algo,
&fwdAlgoBestPerf.memory
)
);
wspace =
- Tensor({static_cast(fwdAlgoBestPerf.memory)}, fl::dtype::b8);
+ Tensor{{static_cast(fwdAlgoBestPerf.memory)}, fl::dtype::b8};
}
{
DevicePtr inPtr(input);
@@ -390,22 +390,22 @@ Tensor CudnnAutogradExtension::conv2d(
cudnnConvolutionForward(
handle,
one,
- inDesc.descriptor,
+ inDesc.get(),
inPtr.get(),
- wtDesc.descriptor,
+ wtDesc.get(),
wtPtr.get(),
- convDesc.descriptor,
+ convDesc.get(),
fwdAlgoBestPerf.algo,
wspacePtr.get(),
fwdAlgoBestPerf.memory,
zero,
- outDesc.descriptor,
+ outDesc.get(),
outPtr.get()
)
);
if(hasBias) {
- auto bsDesc = TensorDescriptor(bias);
+ auto bsDesc = TensorDescriptor{bias};
DevicePtr bsPtr(bias);
// ensure cudnn compute stream waits on stream of bias tensor
relativeSync(cudnnStream, {bias});
@@ -413,10 +413,10 @@ Tensor CudnnAutogradExtension::conv2d(
cudnnAddTensor(
handle,
one,
- bsDesc.descriptor,
+ bsDesc.get(),
bsPtr.get(),
one,
- outDesc.descriptor,
+ outDesc.get(),
outPtr.get()
)
);
@@ -453,10 +453,10 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
// benchmarking suggests input or weight casting should occur, these
// descriptors may not be used/new ones with the correct types will be
// used instead.
- auto iDesc = TensorDescriptor(input);
- auto wDesc = FilterDescriptor(weight);
- auto cDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups);
- auto oDesc = TensorDescriptor(gradOutput);
+ auto iDesc = TensorDescriptor{input};
+ auto wDesc = FilterDescriptor{weight};
+ auto cDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups};
+ auto oDesc = TensorDescriptor{gradOutput};
setDefaultMathType(cDesc, input);
@@ -481,40 +481,40 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
relativeSync(cudnnStream, {wtTensor});
bool isStrided = (dx * dy) > 1;
auto bwdDataAlgoBestPerf = getBwdDataAlgo(
- iDesc.descriptor,
- wDesc.descriptor,
- cDesc.descriptor,
- oDesc.descriptor,
+ iDesc.get(),
+ wDesc.get(),
+ cDesc.get(),
+ oDesc.get(),
isStrided,
inTensor.type()
);
Tensor ws;
try {
- ws = Tensor(
- {static_cast(bwdDataAlgoBestPerf.memory)},
+ ws = Tensor{
+ {static_cast(bwdDataAlgoBestPerf.memory)},
fl::dtype::b8
- );
+ };
} catch(const std::exception&) {
bwdDataAlgoBestPerf.algo = kBwdDataDefaultAlgo;
CUDNN_CHECK_ERR(
cudnnGetConvolutionBackwardDataWorkspaceSize(
hndl,
- wDesc.descriptor,
- oDesc.descriptor,
- cDesc.descriptor,
- iDesc.descriptor,
+ wDesc.get(),
+ oDesc.get(),
+ cDesc.get(),
+ iDesc.get(),
bwdDataAlgoBestPerf.algo,
&bwdDataAlgoBestPerf.memory
)
);
- ws = Tensor(
- {static_cast(bwdDataAlgoBestPerf.memory)},
+ ws = Tensor{
+ {static_cast(bwdDataAlgoBestPerf.memory)},
fl::dtype::b8
- );
+ };
}
- auto gradInput = Tensor(inTensor.shape(), inTensor.type());
+ auto gradInput = Tensor{inTensor.shape(), inTensor.type()};
{
DevicePtr gradInputPtr(gradInput);
DevicePtr gradResultPtr(gradOutputTensor);
@@ -525,16 +525,16 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
cudnnConvolutionBackwardData(
hndl,
oneg,
- wDesc.descriptor,
+ wDesc.get(),
wPtr.get(),
- oDesc.descriptor,
+ oDesc.get(),
gradResultPtr.get(),
- cDesc.descriptor,
+ cDesc.get(),
bwdDataAlgoBestPerf.algo,
wsPtr.get(),
bwdDataAlgoBestPerf.memory,
zerog,
- iDesc.descriptor,
+ iDesc.get(),
gradInputPtr.get()
)
);
@@ -570,18 +570,18 @@ Tensor CudnnAutogradExtension::conv2dBackwardData(
&wtTensorF32,
&gradOutput,
&gradOutputTensorF32]() {
- inTensorF32 = input.astype(fl::dtype::f32);
- wtTensorF32 = weight.astype(fl::dtype::f32);
- gradOutputTensorF32 = gradOutput.astype(fl::dtype::f32);
+ inTensorF32 = input.asType(fl::dtype::f32);
+ wtTensorF32 = weight.asType(fl::dtype::f32);
+ gradOutputTensorF32 = gradOutput.asType(fl::dtype::f32);
},
/* incrementCount = */ false
);
- auto iDescF32 = TensorDescriptor(inTensorF32);
- auto wDescF32 = FilterDescriptor(wtTensorF32);
+ auto iDescF32 = TensorDescriptor{inTensorF32};
+ auto wDescF32 = FilterDescriptor{wtTensorF32};
auto cDescF32 =
- ConvDescriptor(fl::dtype::f32, px, py, sx, sy, dx, dy, groups);
- auto oDescF32 = TensorDescriptor(gradOutputTensorF32);
+ ConvDescriptor{fl::dtype::f32, px, py, sx, sy, dx, dy, groups};
+ auto oDescF32 = TensorDescriptor{gradOutputTensorF32};
// core bwd data computation
dataGradBenchmark->audit(
[&dataGradOut,
@@ -671,10 +671,10 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias(
// benchmarking suggests input or weight casting should occur, these
// descriptors may not be used/new ones with the correct types will be
// used instead.
- auto iDesc = TensorDescriptor(input);
- auto wDesc = FilterDescriptor(weight);
- auto cDesc = ConvDescriptor(input.type(), px, py, sx, sy, dx, dy, groups);
- auto oDesc = TensorDescriptor(gradOutput);
+ auto iDesc = TensorDescriptor{input};
+ auto wDesc = FilterDescriptor{weight};
+ auto cDesc = ConvDescriptor{input.type(), px, py, sx, sy, dx, dy, groups};
+ auto oDesc = TensorDescriptor{gradOutput};
setDefaultMathType(cDesc, input);
@@ -699,39 +699,39 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias(
// ensure cudnn compute stream waits on stream of input tensor
relativeSync(cudnnStream, {inTensor});
auto bwdFilterAlgoBestPerf = getBwdFilterAlgo(
- iDesc.descriptor,
- wDesc.descriptor,
- cDesc.descriptor,
- oDesc.descriptor,
+ iDesc.get(),
+ wDesc.get(),
+ cDesc.get(),
+ oDesc.get(),
inTensor.type()
);
Tensor ws;
try {
- ws = Tensor(
- {static_cast(bwdFilterAlgoBestPerf.memory)},
+ ws = Tensor{
+ {static_cast(bwdFilterAlgoBestPerf.memory)},
fl::dtype::b8
- );
+ };
} catch(const std::exception&) {
bwdFilterAlgoBestPerf.algo = kBwdFilterDefaultAlgo;
CUDNN_CHECK_ERR(
cudnnGetConvolutionBackwardFilterWorkspaceSize(
hndl,
- iDesc.descriptor,
- oDesc.descriptor,
- cDesc.descriptor,
- wDesc.descriptor,
+ iDesc.get(),
+ oDesc.get(),
+ cDesc.get(),
+ wDesc.get(),
bwdFilterAlgoBestPerf.algo,
&bwdFilterAlgoBestPerf.memory
)
);
- ws = Tensor(
- {static_cast(bwdFilterAlgoBestPerf.memory)},
+ ws = Tensor{
+ {static_cast(bwdFilterAlgoBestPerf.memory)},
fl::dtype::b8
- );
+ };
}
- auto gradWeight = Tensor(wtTensor.shape(), wtTensor.type());
+ auto gradWeight = Tensor{wtTensor.shape(), wtTensor.type()};
{
DevicePtr gradWeightPtr(gradWeight);
DevicePtr gradResultPtr(gradOutputTensor);
@@ -742,16 +742,16 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias(
cudnnConvolutionBackwardFilter(
hndl,
oneg,
- iDesc.descriptor,
+ iDesc.get(),
iPtr.get(),
- oDesc.descriptor,
+ oDesc.get(),
gradResultPtr.get(),
- cDesc.descriptor,
+ cDesc.get(),
bwdFilterAlgoBestPerf.algo,
wsPtr.get(),
bwdFilterAlgoBestPerf.memory,
zerog,
- wDesc.descriptor,
+ wDesc.get(),
gradWeightPtr.get()
)
);
@@ -787,18 +787,18 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias(
&wtTensorF32,
&gradOutput,
&gradOutputTensorF32]() {
- inTensorF32 = input.astype(fl::dtype::f32);
- wtTensorF32 = weight.astype(fl::dtype::f32);
- gradOutputTensorF32 = gradOutput.astype(fl::dtype::f32);
+ inTensorF32 = input.asType(fl::dtype::f32);
+ wtTensorF32 = weight.asType(fl::dtype::f32);
+ gradOutputTensorF32 = gradOutput.asType(fl::dtype::f32);
},
/* incrementCount = */ false
);
- auto iDescF32 = TensorDescriptor(inTensorF32);
- auto wDescF32 = FilterDescriptor(wtTensorF32);
+ auto iDescF32 = TensorDescriptor{inTensorF32};
+ auto wDescF32 = FilterDescriptor{wtTensorF32};
auto cDescF32 =
- ConvDescriptor(fl::dtype::f32, px, py, sx, sy, dx, dy, groups);
- auto oDescF32 = TensorDescriptor(gradOutputTensorF32);
+ ConvDescriptor{fl::dtype::f32, px, py, sx, sy, dx, dy, groups};
+ auto oDescF32 = TensorDescriptor{gradOutputTensorF32};
// core bwd data computation
filterGradBenchmark->audit(
[&filterGradOut,
@@ -860,21 +860,21 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias(
const Tensor& bsTensor,
const Tensor& gradOutput,
const TensorDescriptor& oDesc) -> Tensor {
- auto gradBias = Tensor(bsTensor.shape(), bsTensor.type());
+ auto gradBias = Tensor{bsTensor.shape(), bsTensor.type()};
{
DevicePtr gradBiasPtr(gradBias);
DevicePtr gradResultPtr(gradOutput);
// ensure cudnn compute stream waits on gradient tensor streams
relativeSync(cudnnStream, {gradOutput, gradBias});
- auto bDesc = TensorDescriptor(bsTensor);
+ auto bDesc = TensorDescriptor{bsTensor};
CUDNN_CHECK_ERR(
cudnnConvolutionBackwardBias(
hndl,
oneg,
- oDesc.descriptor,
+ oDesc.get(),
gradResultPtr.get(),
zerog,
- bDesc.descriptor,
+ bDesc.get(),
gradBiasPtr.get()
)
);
@@ -906,12 +906,12 @@ std::pair CudnnAutogradExtension::conv2dBackwardFilterBias(
// Time cast bias and grad output if benchmarking
biasGradBenchmark->audit(
[&bias, &gradOutput, &biasF32, &gradOutputF32]() {
- biasF32 = bias.astype(fl::dtype::f32);
- gradOutputF32 = gradOutput.astype(fl::dtype::f32);
+ biasF32 = bias.asType(fl::dtype::f32);
+ gradOutputF32 = gradOutput.asType(fl::dtype::f32);
},
/* incrementCount = */ false
);
- auto oDescF32 = TensorDescriptor(gradOutputF32);
+ auto oDescF32 = TensorDescriptor{gradOutputF32};
// Perform bias gradient computation
biasGradBenchmark->audit(
[&biasGradOut,
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp
index 305a6cc..eaac140 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.cpp
@@ -16,13 +16,11 @@ namespace fl {
std::shared_ptr<fl::DynamicBenchmark> CudnnAutogradExtension::createBenchmarkOptions() {
 return std::make_shared<fl::DynamicBenchmark>(
 std::make_shared<fl::DynamicBenchmarkOptions<KernelMode>>(
- std::vector<KernelMode>(
- {
- KernelMode::F32,
- KernelMode::F32_ALLOW_CONVERSION,
- KernelMode::F16
- }
- ),
+ std::vector<KernelMode>{
+ KernelMode::F32,
+ KernelMode::F32_ALLOW_CONVERSION,
+ KernelMode::F16
+ },
fl::kDynamicBenchmarkDefaultCount
)
);
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.h b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.h
index b960c30..2edfd89 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.h
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnAutogradExtension.h
@@ -1,8 +1,8 @@
/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * SPDX-License-Identifier: MIT
*
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
+ * Original code: Copyright (c) Meta Platforms, Inc. (see FLASHLIGHT_LICENSE)
+ * Modifications: Copyright (c) 2026 Lukas Thomann (see LICENSE)
*/
#pragma once
@@ -19,96 +19,96 @@ class CudnnAutogradExtension : public AutogradExtension {
// TODO(jacobkahn): implement getCudnnHandle
public:
- bool isDataTypeSupported(const fl::dtype& dtype) const override;
+ bool isDataTypeSupported(fl::dtype const& dtype) const override;
- enum class KernelMode {F32 = 0, F32_ALLOW_CONVERSION = 1, F16 = 2};
+ enum class KernelMode { F32 = 0, F32_ALLOW_CONVERSION = 1, F16 = 2 };
std::shared_ptr createBenchmarkOptions() override;
/**************************** Forward ****************************/
Tensor conv2d(
- const Tensor& input,
- const Tensor& weights,
- const Tensor& bias,
- const int sx,
- const int sy,
- const int px,
- const int py,
- const int dx,
- const int dy,
- const int groups,
+ Tensor const& input,
+ Tensor const& weights,
+ Tensor const& bias,
+ int sx,
+ int sy,
+ int px,
+ int py,
+ int dx,
+ int dy,
+ int groups,
std::shared_ptr payload
) override;
Tensor pool2d(
- const Tensor& input,
- const int wx,
- const int wy,
- const int sx,
- const int sy,
- const int px,
- const int py,
- const PoolingMode mode,
+ Tensor const& input,
+ int wx,
+ int wy,
+ int sx,
+ int sy,
+ int px,
+ int py,
+ PoolingMode mode,
std::shared_ptr payload
) override;
Tensor batchnorm(
Tensor& saveMean,
Tensor& saveVar,
- const Tensor& input,
- const Tensor& weight,
- const Tensor& bias,
+ Tensor const& input,
+ Tensor const& weight,
+ Tensor const& bias,
Tensor& runningMean,
Tensor& runningVar,
- const std::vector& axes,
- const bool train,
- const double momentum,
- const double epsilon,
+ std::vector const& axes,
+ bool train,
+ double momentum,
+ double epsilon,
std::shared_ptr payload
) override;
std::tuple rnn(
- const Tensor& input,
- const Tensor& hiddenState,
- const Tensor& cellState,
- const Tensor& weights,
- const int hiddenSize,
- const int numLayers,
- const RnnMode mode,
- const bool bidirectional,
- const float dropout,
- std::shared_ptr payload
+ Tensor const& input,
+ Tensor const& hiddenState,
+ Tensor const& cellState,
+ Tensor const& weights,
+ int hiddenSize,
+ int numLayers,
+ RnnMode mode,
+ bool bidirectional,
+ float dropProb,
+ std::shared_ptr autogradPayload
) override;
/**************************** Backward ****************************/
// ]----- Convolution
Tensor conv2dBackwardData(
- const Tensor& gradOutput,
- const Tensor& input,
- const Tensor& weight,
- const int sx,
- const int sy,
- const int px,
- const int py,
- const int dx,
- const int dy,
- const int groups,
+ Tensor const& gradOutput,
+ Tensor const& input,
+ Tensor const& weight,
+ int sx,
+ int sy,
+ int px,
+ int py,
+ int dx,
+ int dy,
+ int groups,
std::shared_ptr dataGradBenchmark,
std::shared_ptr payload
) override;
std::pair conv2dBackwardFilterBias(
- const Tensor& gradOutput,
- const Tensor& input,
- const Tensor& weights,
- const Tensor& bias,
- const int sx,
- const int sy,
- const int px,
- const int py,
- const int dx,
- const int dy,
- const int groups,
+ Tensor const& gradOutput,
+ Tensor const& input,
+ Tensor const& weights,
+ Tensor const& bias,
+ int sx,
+ int sy,
+ int px,
+ int py,
+ int dx,
+ int dy,
+ int groups,
std::shared_ptr filterBench,
std::shared_ptr biasBench,
std::shared_ptr autogradPayload
@@ -116,47 +116,59 @@ class CudnnAutogradExtension : public AutogradExtension {
// ]----- pool2D
Tensor pool2dBackward(
- const Tensor& gradOutput,
- const Tensor& input,
- const Tensor& poolOutput,
- const int wx,
- const int wy,
- const int sx,
- const int sy,
- const int px,
- const int py,
- const PoolingMode mode,
+ Tensor const& gradOutput,
+ Tensor const& input,
+ Tensor const& poolOutput,
+ int wx,
+ int wy,
+ int sx,
+ int sy,
+ int px,
+ int py,
+ PoolingMode mode,
std::shared_ptr payload
) override;
// ]----- batchnorm
std::tuple batchnormBackward(
- const Tensor& gradOutput,
- const Tensor& saveMean,
- const Tensor& saveVar,
- const Tensor& input,
- const Tensor& weight,
- const std::vector& axes,
- const bool train,
- const float epsilon,
+ Tensor const& gradOutput,
+ Tensor const& saveMean,
+ Tensor const& saveVar,
+ Tensor const& input,
+ Tensor const& weight,
+ std::vector const& axes,
+ bool train,
+ float epsilon,
std::shared_ptr payload
) override;
// ]----- rnn
std::tuple rnnBackward(
- const Tensor& input,
- const Tensor& hiddenState,
- const Tensor& cellState,
- const Tensor& weights,
- const std::shared_ptr gradData,
- const Tensor& output,
- const int numLayers,
- const int hiddenSize,
- const RnnMode mode,
- const bool bidirectional,
- const float dropProb,
- std::shared_ptr payload
+ Tensor const& input,
+ Tensor const& hiddenState,
+ Tensor const& cellState,
+ Tensor const& weights,
+ std::shared_ptr gradData,
+ Tensor const& output,
+ int numLayers,
+ int hiddenSize,
+ RnnMode mode,
+ bool bidirectional,
+ float dropProb,
+ std::shared_ptr autogradPayload
) override;
+
+private:
+
+ static void checkHiddenStateDims(int hiddenSize, Tensor const& hiddenState, int batchSize, int totalLayers);
+ static void checkCellStateDims(
+ int hiddenSize,
+ RnnMode mode,
+ Tensor const& cellState,
+ int batchSize,
+ int totalLayers
+ );
+
};
} // namespace fl
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnRnnUtils.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnRnnUtils.cpp
new file mode 100644
index 0000000..c64c234
--- /dev/null
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnRnnUtils.cpp
@@ -0,0 +1,305 @@
+#include "CudnnRnnUtils.h"
+
+#include "flashlight/fl/common/DevicePtr.h"
+#include "flashlight/fl/tensor/Compute.h"
+
+
+namespace fl {
+namespace {
+ struct temp_space_sizes {
+ size_t size;
+ size_t reserveSize;
+ };
+
+ temp_space_sizes rnn_temp_space_sizes(
+ cudnnHandle_t handle,
+ RNNDescriptor const& rnnDescriptor,
+ RNNDataDescriptor const& xDescriptor,
+ cudnnForwardMode_t mode
+ ) {
+ temp_space_sizes sizes{};
+
+ CUDNN_CHECK_ERR(
+ cudnnGetRNNTempSpaceSizes(
+ handle,
+ rnnDescriptor.get(),
+ mode,
+ xDescriptor.get(),
+ &sizes.size,
+ &sizes.reserveSize
+ )
+ );
+
+ return sizes;
+ }
+
+ size_t rnn_weight_space_size(
+ cudnnHandle_t handle,
+ RNNDescriptor const& rnnDescriptor
+ ) {
+ size_t size = 0;
+
+ CUDNN_CHECK_ERR(
+ cudnnGetRNNWeightSpaceSize(handle, rnnDescriptor.get(), &size)
+ );
+ return size;
+ }
+
+ std::optional<Tensor> create_dev_seq_lengths(int batchSize, int seqLength) {
+ // see cuDNN docs for cudnnRNNForward as explanation
+#if CUDNN_VERSION >= 8901
+ return std::nullopt;
+#else
+ return fl::full({batchSize}, seqLength, fl::dtype::s32);
+#endif
+ }
+
+}
+}
+
+namespace fl {
+void cudnn_rnn_forward(
+ int batchSize,
+ int seqLength,
+ bool train,
+ RNNDescriptor const& rnnDesc,
+ Tensor const& x,
+ Tensor const& y,
+ Tensor const& weights,
+ TensorDescriptor const& cxDesc,
+ TensorDescriptor const& hxDesc,
+ Tensor const& hy,
+ Tensor const& cy,
+ Tensor const& hiddenState,
+ Tensor const& cellState,
+ Tensor& reserveSpace
+) {
+ RNNDataDescriptor xDesc{x.type(), x.shape()};
+ RNNDataDescriptor yDesc{y.type(), y.shape()};
+
+ auto handle = getCudnnHandle();
+
+ size_t weightSpaceSize = rnn_weight_space_size(handle, rnnDesc);
+
+ if(weightSpaceSize != weights.bytes())
+ throw std::invalid_argument("invalid # of parameters or wrong input shape for RNN");
+
+ auto const forwardMode = train ? CUDNN_FWD_MODE_TRAINING : CUDNN_FWD_MODE_INFERENCE;
+
+ auto [workspaceSize, reserveSize] = rnn_temp_space_sizes(handle, rnnDesc, xDesc, forwardMode);
+
+ Tensor workspace({static_cast(workspaceSize)}, fl::dtype::b8);
+ // Space must be reused between forward and backward for cuDNN
+
+ reserveSpace = Tensor{{static_cast(reserveSize)}, fl::dtype::b8};
+
+ auto devSeqLengths = create_dev_seq_lengths(batchSize, seqLength);
+
+ auto const& cudnnStream = getCudnnStream();
+
+ {
+ auto contiguousX = x.asContiguousTensor();
+ auto contiguousWeights = weights.asContiguousTensor();
+ DevicePtr xRaw(contiguousX);
+ DevicePtr hxRaw(hiddenState);
+ DevicePtr cxRaw(cellState);
+ DevicePtr weightSpaceRaw(contiguousWeights);
+ DevicePtr yRaw(y);
+ DevicePtr hyRaw(hy);
+ DevicePtr cyRaw(cy);
+ DevicePtr workspaceRaw(workspace);
+ DevicePtr reserveSpaceRaw(reserveSpace);
+
+ std::optional<DevicePtr> devSeqLengthsRaw{};
+
+ if(devSeqLengths)
+ devSeqLengthsRaw.emplace(*devSeqLengths);
+
+ // ensure cudnn compute stream waits on input/output tensor streams
+
+ std::vector waits{
+ contiguousX,
+ hiddenState,
+ cellState,
+ contiguousWeights,
+ y,
+ hy,
+ cy,
+ workspace,
+ reserveSpace,
+ };
+ if(devSeqLengths)
+ waits.push_back(*devSeqLengths);
+
+ relativeSync(cudnnStream, waits);
+
+
+ CUDNN_CHECK_ERR(
+ cudnnRNNForward(
+ handle,
+ rnnDesc.get(),
+ forwardMode,
+ devSeqLengthsRaw ? devSeqLengthsRaw->getAs<int32_t>() : nullptr,
+
+ xDesc.get(),
+ xRaw.get(),
+ yDesc.get(),
+ yRaw.get(),
+
+ hxDesc.get(),
+ hxRaw.get(),
+ hyRaw.get(),
+ cxDesc.get(),
+ cxRaw.get(),
+ cyRaw.get(),
+
+ weightSpaceSize,
+ weightSpaceRaw.get(),
+
+ workspaceSize,
+ workspaceRaw.get(),
+
+ reserveSize,
+ reserveSpaceRaw.get()
+ )
+ );
+ }
+
+ // ensure output tensor streams wait on cudnn compute stream
+ relativeSync({y, hy, cy}, cudnnStream);
+}
+
+void cudnn_rnn_backward(
+ int batchSize,
+ int seqLength,
+ RNNDescriptor const& rnnDesc,
+
+ Tensor const& x,
+ Tensor const& y,
+ Tensor const& dy,
+ Tensor const& weights,
+ TensorDescriptor const& cxDesc,
+ TensorDescriptor const& hxDesc,
+ Tensor const& dhy,
+ Tensor const& dcy,
+ Tensor const& hiddenState,
+ Tensor const& cellState,
+ Tensor const& dx,
+ Tensor const& dhx,
+ Tensor const& dcx,
+ Tensor const& dw,
+
+ Tensor const& reserveSpace
+) {
+ auto handle = getCudnnHandle();
+ auto const& cudnnStream = getCudnnStream();
+
+ RNNDataDescriptor xDesc{x.type(), x.shape()};
+ RNNDataDescriptor yDesc{y.type(), y.shape()};
+
+ size_t weightSpaceSize = rnn_weight_space_size(handle, rnnDesc);
+ auto [workspaceSize, reserveSize] = rnn_temp_space_sizes(handle, rnnDesc, xDesc, CUDNN_FWD_MODE_TRAINING);
+
+ Tensor workspace({static_cast(workspaceSize)}, fl::dtype::b8);
+
+ auto devSeqLengths = create_dev_seq_lengths(batchSize, seqLength);
+
+ std::vector waits = {y, workspace, reserveSpace};
+ if(devSeqLengths)
+ waits.push_back(*devSeqLengths);
+
+ // ensure cudnn compute stream waits on input/output tensor streams
+ relativeSync(cudnnStream, waits);
+
+ DevicePtr yRaw(y);
+ DevicePtr workspaceRaw(workspace);
+ DevicePtr reserveSpaceRaw(reserveSpace);
+
+ std::optional<DevicePtr> devSeqLengthsRaw{};
+ if(devSeqLengths)
+ devSeqLengthsRaw.emplace(*devSeqLengths);
+
+ {
+ DevicePtr dyRaw(dy); // Has to be set to 0 if empty
+ DevicePtr dhyRaw(dhy);
+ DevicePtr dcyRaw(dcy);
+
+ DevicePtr wRaw(weights);
+
+ DevicePtr hxRaw(hiddenState);
+ DevicePtr cxRaw(cellState);
+
+ DevicePtr dxRaw(dx);
+ DevicePtr dhxRaw(dhx);
+ DevicePtr dcxRaw(dcx);
+
+ // ensure cudnn compute stream waits on input/output tensor streams
+ relativeSync(
+ cudnnStream,
+ {dy, dhy, dcy, weights, hiddenState, cellState, dx, dhx, dcx}
+ );
+
+ /* We need to update reserveSpace even if we just want the
+ * weight gradients. */
+ CUDNN_CHECK_ERR(
+ cudnnRNNBackwardData_v8(
+ handle,
+ rnnDesc.get(),
+ devSeqLengthsRaw ? devSeqLengthsRaw->getAs<int32_t>() : nullptr,
+ yDesc.get(),
+ yRaw.get(),
+ dyRaw.get(),
+ xDesc.get(),
+ dxRaw.get(),
+ hxDesc.get(),
+ hxRaw.get(),
+ dhyRaw.get(),
+ dhxRaw.get(),
+ cxDesc.get(),
+ cxRaw.get(),
+ dcyRaw.get(),
+ dcxRaw.get(),
+ weightSpaceSize,
+ wRaw.get(),
+ workspaceSize,
+ workspaceRaw.get(),
+ reserveSpace.bytes(),
+ reserveSpaceRaw.get()
+ )
+ );
+ }
+
+ {
+ DevicePtr xRaw(x);
+ DevicePtr dwRaw(dw);
+ DevicePtr hxRaw(hiddenState);
+
+ // ensure cudnn compute stream waits on input/output tensor streams
+ relativeSync(cudnnStream, {x, dw, hiddenState});
+
+ CUDNN_CHECK_ERR(
+ cudnnRNNBackwardWeights_v8(
+ handle,
+ rnnDesc.get(),
+ CUDNN_WGRAD_MODE_ADD,
+ devSeqLengthsRaw ? devSeqLengthsRaw->getAs<int32_t>() : nullptr,
+ xDesc.get(),
+ xRaw.get(),
+ hxDesc.get(),
+ hxRaw.get(),
+ yDesc.get(),
+ yRaw.get(),
+ weightSpaceSize,
+ dwRaw.get(),
+ workspaceSize,
+ workspaceRaw.get(),
+ reserveSpace.bytes(),
+ reserveSpaceRaw.get()
+ )
+ );
+ }
+
+ // ensure output tensor streams wait on cudnn compute stream
+ relativeSync({dx, dhx, dcx, dw}, cudnnStream);
+}
+}
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnRnnUtils.h b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnRnnUtils.h
new file mode 100644
index 0000000..3ed5b07
--- /dev/null
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnRnnUtils.h
@@ -0,0 +1,45 @@
+#pragma once
+#include "CudnnUtils.h"
+
+namespace fl {
+void cudnn_rnn_forward(
+ int batchSize,
+ int seqLength,
+ bool train,
+ RNNDescriptor const& rnnDesc,
+
+ Tensor const& x,
+ Tensor const& y,
+ Tensor const& weights,
+ TensorDescriptor const& cxDesc,
+ TensorDescriptor const& hxDesc,
+ Tensor const& hy,
+ Tensor const& cy,
+ Tensor const& hiddenState,
+ Tensor const& cellState,
+
+ Tensor& reserveSpace // out
+);
+void cudnn_rnn_backward(
+ int batchSize,
+ int seqLength,
+ RNNDescriptor const& rnnDesc,
+
+ Tensor const& x,
+ Tensor const& y,
+ Tensor const& dy,
+ Tensor const& weights,
+ TensorDescriptor const& cxDesc,
+ TensorDescriptor const& hxDesc,
+ Tensor const& dhy,
+ Tensor const& dcy,
+ Tensor const& hiddenState,
+ Tensor const& cellState,
+ Tensor const& dx,
+ Tensor const& dhx,
+ Tensor const& dcx,
+ Tensor const& dw,
+
+ Tensor const& reserveSpace
+);
+}
diff --git a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp
index 82cadcb..4a20900 100644
--- a/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp
+++ b/flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.cpp
@@ -1,8 +1,8 @@
/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * SPDX-License-Identifier: MIT
*
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
+ * Original code: Copyright (c) Meta Platforms, Inc. (see FLASHLIGHT_LICENSE)
+ * Modifications: Copyright (c) 2026 Lukas Thomann (see LICENSE)
*/
#include "flashlight/fl/autograd/tensor/backend/cudnn/CudnnUtils.h"
@@ -25,16 +25,16 @@ struct DeviceHandle {
std::shared_ptr stream;
explicit DeviceHandle(std::shared_ptr _stream) : cudnnHandle(nullptr),
- stream(_stream) {
+ stream(_stream) {
CUDNN_CHECK_ERR(cudnnCreate(&cudnnHandle));
CUDNN_CHECK_ERR(cudnnSetStream(cudnnHandle, stream->handle()));
}
~DeviceHandle() {
if(cudnnHandle) {
-// See https://git.io/fNQnM - sometimes, at exit, the CUDA context
-// (or something) is already destroyed by the time a handle gets destroyed
-// because of an issue with the destruction order.
+ // See https://git.io/fNQnM - sometimes, at exit, the CUDA context
+ // (or something) is already destroyed by the time a handle gets destroyed
+ // because of an issue with the destruction order.
#ifdef NO_CUDNN_DESTROY_HANDLE
#else
CUDNN_CHECK_ERR(cudnnDestroy(cudnnHandle));
@@ -43,16 +43,16 @@ struct DeviceHandle {
}
};
-const float kFloatZero = 0.0;
-const float kFloatOne = 1.0;
+constexpr float kFloatZero = 0.0;
+constexpr float kFloatOne = 1.0;
-const double kDoubleZero = 0.0;
-const double kDoubleOne = 1.0;
+constexpr double kDoubleZero = 0.0;
+constexpr double kDoubleOne = 1.0;
// TODO: move this to CudnnAutogradExtension if we make it a singleton
std::unordered_map handles;
-const DeviceHandle& getActiveDeviceHandle() {
+DeviceHandle const& getActiveDeviceHandle() {
auto& manager = fl::DeviceManager::getInstance();
auto& cudaDevice =
manager.getActiveDevice(fl::DeviceType::CUDA).impl();
@@ -88,58 +88,43 @@ namespace fl {
void cudnnCheckErr(cudnnStatus_t status) {
if(status == CUDNN_STATUS_SUCCESS)
return;
- const char* err = cudnnGetErrorString(status);
+ char const* err = cudnnGetErrorString(status);
switch(status) {
- case CUDNN_STATUS_BAD_PARAM:
- throw std::invalid_argument(err);
- default:
- throw std::runtime_error(err);
+ case CUDNN_STATUS_BAD_PARAM: throw std::invalid_argument(err);
+ default: throw std::runtime_error(err);
}
}
-cudnnDataType_t cudnnMapToType(const fl::dtype& t) {
+cudnnDataType_t cudnnMapToType(fl::dtype const& t) {
switch(t) {
- case fl::dtype::f16:
- return CUDNN_DATA_HALF;
- case fl::dtype::f32:
- return CUDNN_DATA_FLOAT;
- case fl::dtype::f64:
- return CUDNN_DATA_DOUBLE;
- default:
- throw std::invalid_argument("unsupported data type for cuDNN");
+ case fl::dtype::f16: return CUDNN_DATA_HALF;
+ case fl::dtype::f32: return CUDNN_DATA_FLOAT;
+ case fl::dtype::f64: return CUDNN_DATA_DOUBLE;
+ default: throw std::invalid_argument("unsupported data type for cuDNN");
}
}
-cudnnPoolingMode_t cudnnMapToPoolingMode(const PoolingMode mode) {
+cudnnPoolingMode_t cudnnMapToPoolingMode(PoolingMode const mode) {
switch(mode) {
- case PoolingMode::MAX:
- return CUDNN_POOLING_MAX;
- case PoolingMode::AVG_INCLUDE_PADDING:
- return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
- case PoolingMode::AVG_EXCLUDE_PADDING:
- return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
- default:
- throw std::invalid_argument("unsupported pooling mode for cuDNN");
+ case PoolingMode::MAX: return CUDNN_POOLING_MAX;
+ case PoolingMode::AVG_INCLUDE_PADDING: return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+ case PoolingMode::AVG_EXCLUDE_PADDING: return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+ default: throw std::invalid_argument("unsupported pooling mode for cuDNN");
}
}
-cudnnRNNMode_t cudnnMapToRNNMode(const RnnMode mode) {
+cudnnRNNMode_t cudnnMapToRNNMode(RnnMode const mode) {
switch(mode) {
- case RnnMode::RELU:
- return CUDNN_RNN_RELU;
- case RnnMode::TANH:
- return CUDNN_RNN_TANH;
- case RnnMode::LSTM:
- return CUDNN_LSTM;
- case RnnMode::GRU:
- return CUDNN_GRU;
- default:
- throw std::invalid_argument("unsupported RNN mode for cuDNN");
+ case RnnMode::RELU: return CUDNN_RNN_RELU;
+ case RnnMode::TANH: return CUDNN_RNN_TANH;
+ case RnnMode::LSTM: return CUDNN_LSTM;
+ case RnnMode::GRU: return CUDNN_GRU;
+ default: throw std::invalid_argument("unsupported RNN mode for cuDNN");
}
}
-TensorDescriptor::TensorDescriptor(const fl::dtype type, const Shape& flDims) {
- CUDNN_CHECK_ERR(cudnnCreateTensorDescriptor(&descriptor));
+TensorDescriptor::TensorDescriptor(fl::dtype const type, Shape const& flDims) {
+ CUDNN_CHECK_ERR(cudnnCreateTensorDescriptor(&_handle));
cudnnDataType_t cudnntype = cudnnMapToType(type);
std::array dims = {1, 1, 1, 1};
@@ -156,7 +141,7 @@ TensorDescriptor::TensorDescriptor(const fl::dtype type, const Shape& flDims) {
CUDNN_CHECK_ERR(
cudnnSetTensorNdDescriptor(
- descriptor,
+ _handle,
cudnntype,
dims.size(),
dims.data(),
@@ -165,8 +150,8 @@ TensorDescriptor::TensorDescriptor(const fl::dtype type, const Shape& flDims) {
);
}
-TensorDescriptor::TensorDescriptor(const Tensor& input) {
- CUDNN_CHECK_ERR(cudnnCreateTensorDescriptor(&descriptor));
+TensorDescriptor::TensorDescriptor(Tensor const& input) {
+ CUDNN_CHECK_ERR(cudnnCreateTensorDescriptor(&_handle));
cudnnDataType_t cudnntype = cudnnMapToType(input.type());
auto flStrides = input.strides();
@@ -185,7 +170,7 @@ TensorDescriptor::TensorDescriptor(const Tensor& input) {
CUDNN_CHECK_ERR(
cudnnSetTensorNdDescriptor(
- descriptor /* descriptor handle */,
+ _handle /* descriptor handle */,
cudnntype /* = dataType */,
4,
dims.data(),
@@ -194,21 +179,19 @@ TensorDescriptor::TensorDescriptor(const Tensor& input) {
);
}
-TensorDescriptor::~TensorDescriptor() {
- CUDNN_CHECK_ERR(cudnnDestroyTensorDescriptor(descriptor));
-}
+TensorDescriptor::~TensorDescriptor() { CUDNN_CHECK_ERR(cudnnDestroyTensorDescriptor(_handle)); }
TensorDescriptorArray::TensorDescriptorArray(
int size,
- const fl::dtype type,
- const Shape& dims
+ fl::dtype const type,
+ Shape const& dims
) {
- desc_vec.reserve(size);
+ _descVec.reserve(size);
for(int i = 0; i < size; i++) {
- desc_vec.emplace_back(type, dims);
- desc_raw_vec.push_back(desc_vec.back().descriptor);
+ _descVec.emplace_back(type, dims);
+ _descRawVec.push_back(_descVec.back().get());
}
- descriptors = desc_raw_vec.data();
+ descriptors = _descRawVec.data();
}
TensorDescriptorArray::~TensorDescriptorArray() = default;
@@ -222,7 +205,7 @@ PoolingDescriptor::PoolingDescriptor(
int py,
PoolingMode mode
) {
- CUDNN_CHECK_ERR(cudnnCreatePoolingDescriptor(&descriptor));
+ CUDNN_CHECK_ERR(cudnnCreatePoolingDescriptor(&_handle));
std::array