pytorch
diff --git a/docs/conf.py b/docs/conf.py
Lines changed: 3 additions & 0 deletions
diff --git a/examples/README.rst b/examples/README.rst
Lines changed: 8 additions & 28 deletions
diff --git a/examples/add.py b/examples/add.py
Lines changed: 15 additions & 5 deletions
diff --git a/examples/all_gather_matmul.py b/examples/all_gather_matmul.py
Lines changed: 2 additions & 0 deletions
diff --git a/examples/all_reduce.py b/examples/all_reduce.py
Lines changed: 17 additions & 4 deletions
diff --git a/examples/attention.py b/examples/attention.py
Lines changed: 18 additions & 6 deletions
diff --git a/examples/bmm.py b/examples/bmm.py
Lines changed: 15 additions & 5 deletions
diff --git a/examples/concatenate.py b/examples/concatenate.py
Lines changed: 11 additions & 4 deletions
@@ -176,6 +176,9 @@ def connect(self, event: str, callback: Callable[..., None]) -> None:
 # Output directory for HTML files
 html_output_dir = "../site"
 
+# Base URL for sitemap and canonical links
+html_baseurl = "https://helionlang.com/"
+
 # -- Options for autodoc extension ------------------------------------------
 
 autodoc_default_options = {
 
@@ -1,11 +1,11 @@
 Helion Examples
-==============
+===============
 
 This directory contains examples demonstrating how to use Helion for high-performance tensor operations.
 The examples are organized into the following categories:
 
 Basic Operations
-~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~
 
 - :doc:`add.py <add>`: Element-wise addition with broadcasting support
 - :doc:`exp.py <exp>`: Element-wise exponential function
@@ -15,7 +15,7 @@ Basic Operations
 
 
 Matrix Multiplication Operations
-~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 - :doc:`matmul.py <matmul>`: Basic matrix multiplication
 - :doc:`bmm.py <bmm>`: Batch matrix multiplication
@@ -24,13 +24,13 @@ Matrix Multiplication Operations
 - :doc:`fp8_gemm.py <fp8_gemm>`: Matrix multiplication using FP8 precision
 
 Attention Operations
-~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~
 
 - :doc:`attention.py <attention>`: Scaled dot-product attention mechanism
 - :doc:`fp8_attention.py <fp8_attention>`: Attention mechanism using FP8 precision
 
 Normalization
-~~~~~~~~~~~~
+~~~~~~~~~~~~~
 
 - :doc:`rms_norm.py <rms_norm>`: Root Mean Square (RMS) normalization
 
@@ -43,7 +43,7 @@ Sparse and Jagged Tensors
 - :doc:`moe_matmul_ogs.py <moe_matmul_ogs>`: Mixture-of-Experts matrix multiplication using Outer-Gather-Scatter
 
 Other Operations
-~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~
 
 - :doc:`concatenate.py <concatenate>`: Tensor concatenation along a dimension
 - :doc:`cross_entropy.py <cross_entropy>`: Cross entropy loss function
@@ -55,26 +55,6 @@ Other Operations
    :maxdepth: 2
    :caption: Contents
    :hidden:
+   :glob:
 
-   add
-   all_gather_matmul
-   all_reduce
-   attention
-   bmm
-   concatenate
-   cross_entropy
-   embedding
-   exp
-   fp8_attention
-   fp8_gemm
-   jagged_dense_add
-   jagged_mean
-   long_sum
-   matmul
-   matmul_layernorm
-   matmul_split_k
-   moe_matmul_ogs
-   rms_norm
-   segment_reduction
-   softmax
-   sum
+   *
@@ -1,13 +1,15 @@
 """
 Element-wise Addition Example
-===========================
+=============================
 
 This example demonstrates how to implement an element-wise addition kernel using Helion.
 """
 
 # %%
 # Imports
 # -------
+
+# %%
 from __future__ import annotations
 
 import torch
@@ -16,10 +18,12 @@
 from helion._testing import run_example
 import helion.language as hl
 
-
 # %%
 # Addition Kernel
-# --------------
+# ---------------
+
+
+# %%
 @helion.kernel()
 def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     """
@@ -48,7 +52,10 @@ def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
 
 # %%
 # Verification Function
-# -------------------
+# ---------------------
+
+
+# %%
 def check(m: int, n: int) -> None:
     """
     Verify the add kernel implementation against PyTorch's native add function.
@@ -64,7 +71,10 @@ def check(m: int, n: int) -> None:
 
 # %%
 # Main Function
-# -----------
+# -------------
+
+
+# %%
 def main() -> None:
     """
     Main entry point that runs the add kernel verification with 1024x1024 tensors.
 
@@ -9,6 +9,8 @@
 # %%
 # Imports
 # -------
+
+# %%
 from __future__ import annotations
 
 import os
 
@@ -1,6 +1,6 @@
 """
 One-Shot All-Reduce Example
-========================================
+===========================
 This example demonstrates how to implement a one-shot pulling all-reduce operation
 using Helion and PyTorch's distributed capabilities. It includes a Helion kernel
 demonstrating how to do cross-device synchronization using symmetric memory signal pads
@@ -10,6 +10,8 @@
 # %%
 # Imports
 # -------
+
+# %%
 from __future__ import annotations
 
 import os
@@ -24,6 +26,8 @@
 
 # %%
 # Workaround until symmetric memory natively supports extracting dev_ptrs as tensors: from_blob
+
+# %%
 from_blob_cpp = """
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -72,7 +76,10 @@ def dev_array_to_tensor_short(
 
 # %%
 # One Shot All-Reduce Kernel Implementation
-# ----------------------------------------
+# -----------------------------------------
+
+
+# %%
 @helion.jit(
     config=helion.Config(
         block_sizes=[8192],
@@ -159,7 +166,10 @@ def one_shot_all_reduce_kernel(
 
 # %%
 # Extract tensors from symmetric memory handler
-# ----------------------------------------
+# ---------------------------------------------
+
+
+# %%
 def helion_one_shot_all_reduce(a_shared: torch.Tensor) -> torch.Tensor:
     """
     Prepares symmetric memory tensors for Helion one-shot all-reduce kernel.
@@ -203,7 +213,10 @@ def helion_one_shot_all_reduce(a_shared: torch.Tensor) -> torch.Tensor:
 
 # %%
 # Testing Function
-# ----------------------------------------
+# ----------------
+
+
+# %%
 def test(N: int, device: torch.device, dtype: torch.dtype) -> None:
     """
     Test the Helion all-reduce implementation against PyTorch's reference implementation.
 
@@ -1,6 +1,6 @@
 """
 Attention Example
-========================
+=================
 
 This code implements a custom attention kernel using Helion and PyTorch for efficient computation of scaled dot-product attention,
 with support for both static and dynamic input shapes.
@@ -9,6 +9,8 @@
 # %%
 # Imports
 # -------
+
+# %%
 from __future__ import annotations
 
 import math
@@ -22,10 +24,12 @@
 from helion._testing import run_example
 import helion.language as hl
 
-
 # %%
 # Attention Kernel Implementation
-# ----------------------------
+# -------------------------------
+
+
+# %%
 @helion.kernel(
     # Static shapes provide a speedup for attention
     static_shapes=True,
@@ -86,7 +90,9 @@ def attention(
 
 # %%
 # Dynamic Shape Version
-# ------------------
+# ---------------------
+
+# %%
 attention_dynamic: object = helion.kernel(  # pyright: ignore[reportCallIssue]
     attention.fn,
     configs=attention.configs,  # pyright: ignore[reportArgumentType]
@@ -100,7 +106,10 @@ def attention(
 
 # %%
 # Testing Function
-# -------------
+# ----------------
+
+
+# %%
 def test(
     z: int,
     h: int,
@@ -147,7 +156,10 @@ def ref_attention(
 
 # %%
 # Main Function
-# -----------
+# -------------
+
+
+# %%
 def main() -> None:
     """
     Main entry point that runs the attention kernel test with specific parameters.
 
@@ -1,13 +1,15 @@
 """
 Batch Matrix Multiplication Example
-===============================
+===================================
 
 This example demonstrates how to implement a batch matrix multiplication kernel using Helion.
 """
 
 # %%
 # Imports
 # -------
+
+# %%
 from __future__ import annotations
 
 from packaging import version
@@ -17,11 +19,13 @@
 from helion._testing import run_example
 import helion.language as hl
 
-
 # %%
 # Batch Matrix Multiplication Kernel
-# -------------------------------
+# ----------------------------------
 # static_shapes=True gives a performance boost for matmuls
+
+
+# %%
 @helion.kernel(static_shapes=True)
 def bmm(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
     """
@@ -52,7 +56,10 @@ def bmm(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
 
 # %%
 # Verification Function
-# -------------------
+# ---------------------
+
+
+# %%
 def check(b: int, m: int, k: int, n: int) -> None:
     """
     Verify the bmm kernel implementation against PyTorch's native bmm function.
@@ -70,7 +77,10 @@ def check(b: int, m: int, k: int, n: int) -> None:
 
 # %%
 # Main Function
-# -----------
+# -------------
+
+
+# %%
 def main() -> None:
     """
     Main entry point that runs the bmm kernel verification with specific parameters.
 
@@ -1,13 +1,15 @@
 """
 Tensor Concatenation Example
-========================
+============================
 
 This example demonstrates how to implement a tensor concatenation operation using Helion.
 """
 
 # %%
 # Imports
 # -------
+
+# %%
 from __future__ import annotations
 
 import torch
@@ -16,10 +18,12 @@
 from helion._testing import run_example
 import helion.language as hl
 
-
 # %%
 # Concatenation Kernel
-# -----------------
+# --------------------
+
+
+# %%
 @helion.kernel()
 def concat2d_dim1(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     """
@@ -54,7 +58,10 @@ def concat2d_dim1(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
 
 # %%
 # Main Function
-# -----------
+# -------------
+
+
+# %%
 def main() -> None:
     """
     Main entry point that runs the concatenation kernel verification.