diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
index 559c6d5a8b..001e3a84d8 100644
--- a/.github/actions/fetch_ctk/action.yml
+++ b/.github/actions/fetch_ctk/action.yml
@@ -14,7 +14,7 @@ inputs:
   cuda-components:
     description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
     required: false
-    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile"
+    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,libnvfatbin"
   cuda-path:
     description: "where the CTK components will be installed to, relative to $PWD"
     required: false
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
new file mode 100644
index 0000000000..9f3b187560
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
+
+from ..cynvfatbin cimport *
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef const char* _nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil
+cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
new file mode 100644
index 0000000000..1e1e8d9a12
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
@@ -0,0 +1,344 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t, uintptr_t
+
+import threading
+from .utils import FunctionNotFoundError, NotSupportedError
+
+from cuda.pathfinder import load_nvidia_dynamic_lib
+
+
+###############################################################################
+# Extern
+###############################################################################
+
+# You must 'from .utils import NotSupportedError' before using this template
+
+cdef extern from "<dlfcn.h>" nogil:
+    void* dlopen(const char*, int)
+    char* dlerror()
+    void* dlsym(void*, const char*)
+    int dlclose(void*)
+
+    enum:
+        RTLD_LAZY
+        RTLD_NOW
+        RTLD_GLOBAL
+        RTLD_LOCAL
+
+    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
+
+cdef int get_cuda_version():
+    cdef void* handle = NULL
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        err_msg = dlerror()
+        raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1')
+    err = (<int (*)(int*) noexcept nogil>cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError(f'cuDriverGetVersion returned error code {err}')
+
+    return driver_ver
+
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef object __symbol_lock = threading.Lock()
+cdef bint __py_nvfatbin_init = False
+
+cdef void* __nvFatbinGetErrorString = NULL
+cdef void* __nvFatbinCreate = NULL
+cdef void* __nvFatbinDestroy = NULL
+cdef void* __nvFatbinAddPTX = NULL
+cdef void* __nvFatbinAddCubin = NULL
+cdef void* __nvFatbinAddLTOIR = NULL
+cdef void* __nvFatbinSize = NULL
+cdef void* __nvFatbinGet = NULL
+cdef void* __nvFatbinVersion = NULL
+cdef void* __nvFatbinAddReloc = NULL
+cdef void* __nvFatbinAddTileIR = NULL
+
+
+cdef void* load_library() except* with gil:
+    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvfatbin")._handle_uint
+    return <void*>handle
+
+
+cdef int _init_nvfatbin() except -1 nogil:
+    global __py_nvfatbin_init
+
+    cdef void* handle = NULL
+
+    with gil, __symbol_lock:
+        # Recheck the flag after obtaining the locks
+        if __py_nvfatbin_init:
+            return 0
+
+        # Load function
+        global __nvFatbinGetErrorString
+        __nvFatbinGetErrorString = dlsym(RTLD_DEFAULT, 'nvFatbinGetErrorString')
+        if __nvFatbinGetErrorString == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinGetErrorString = dlsym(handle, 'nvFatbinGetErrorString')
+
+        global __nvFatbinCreate
+        __nvFatbinCreate = dlsym(RTLD_DEFAULT, 'nvFatbinCreate')
+        if __nvFatbinCreate == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinCreate = dlsym(handle, 'nvFatbinCreate')
+
+        global __nvFatbinDestroy
+        __nvFatbinDestroy = dlsym(RTLD_DEFAULT, 'nvFatbinDestroy')
+        if __nvFatbinDestroy == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinDestroy = dlsym(handle, 'nvFatbinDestroy')
+
+        global __nvFatbinAddPTX
+        __nvFatbinAddPTX = dlsym(RTLD_DEFAULT, 'nvFatbinAddPTX')
+        if __nvFatbinAddPTX == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddPTX = dlsym(handle, 'nvFatbinAddPTX')
+
+        global __nvFatbinAddCubin
+        __nvFatbinAddCubin = dlsym(RTLD_DEFAULT, 'nvFatbinAddCubin')
+        if __nvFatbinAddCubin == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddCubin = dlsym(handle, 'nvFatbinAddCubin')
+
+        global __nvFatbinAddLTOIR
+        __nvFatbinAddLTOIR = dlsym(RTLD_DEFAULT, 'nvFatbinAddLTOIR')
+        if __nvFatbinAddLTOIR == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddLTOIR = dlsym(handle, 'nvFatbinAddLTOIR')
+
+        global __nvFatbinSize
+        __nvFatbinSize = dlsym(RTLD_DEFAULT, 'nvFatbinSize')
+        if __nvFatbinSize == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinSize = dlsym(handle, 'nvFatbinSize')
+
+        global __nvFatbinGet
+        __nvFatbinGet = dlsym(RTLD_DEFAULT, 'nvFatbinGet')
+        if __nvFatbinGet == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinGet = dlsym(handle, 'nvFatbinGet')
+
+        global __nvFatbinVersion
+        __nvFatbinVersion = dlsym(RTLD_DEFAULT, 'nvFatbinVersion')
+        if __nvFatbinVersion == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinVersion = dlsym(handle, 'nvFatbinVersion')
+
+        global __nvFatbinAddReloc
+        __nvFatbinAddReloc = dlsym(RTLD_DEFAULT, 'nvFatbinAddReloc')
+        if __nvFatbinAddReloc == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddReloc = dlsym(handle, 'nvFatbinAddReloc')
+
+        global __nvFatbinAddTileIR
+        __nvFatbinAddTileIR = dlsym(RTLD_DEFAULT, 'nvFatbinAddTileIR')
+        if __nvFatbinAddTileIR == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddTileIR = dlsym(handle, 'nvFatbinAddTileIR')
+
+        __py_nvfatbin_init = True
+        return 0
+
+
+cdef inline int _check_or_init_nvfatbin() except -1 nogil:
+    if __py_nvfatbin_init:
+        return 0
+
+    return _init_nvfatbin()
+
+cdef dict func_ptrs = None
+
+
+cpdef dict _inspect_function_pointers():
+    global func_ptrs
+    if func_ptrs is not None:
+        return func_ptrs
+
+    _check_or_init_nvfatbin()
+    cdef dict data = {}
+
+    global __nvFatbinGetErrorString
+    data["__nvFatbinGetErrorString"] = <intptr_t>__nvFatbinGetErrorString
+
+    global __nvFatbinCreate
+    data["__nvFatbinCreate"] = <intptr_t>__nvFatbinCreate
+
+    global __nvFatbinDestroy
+    data["__nvFatbinDestroy"] = <intptr_t>__nvFatbinDestroy
+
+    global __nvFatbinAddPTX
+    data["__nvFatbinAddPTX"] = <intptr_t>__nvFatbinAddPTX
+
+    global __nvFatbinAddCubin
+    data["__nvFatbinAddCubin"] = <intptr_t>__nvFatbinAddCubin
+
+    global __nvFatbinAddLTOIR
+    data["__nvFatbinAddLTOIR"] = <intptr_t>__nvFatbinAddLTOIR
+
+    global __nvFatbinSize
+    data["__nvFatbinSize"] = <intptr_t>__nvFatbinSize
+
+    global __nvFatbinGet
+    data["__nvFatbinGet"] = <intptr_t>__nvFatbinGet
+
+    global __nvFatbinVersion
+    data["__nvFatbinVersion"] = <intptr_t>__nvFatbinVersion
+
+    global __nvFatbinAddReloc
+    data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
+
+    global __nvFatbinAddTileIR
+    data["__nvFatbinAddTileIR"] = <intptr_t>__nvFatbinAddTileIR
+
+    func_ptrs = data
+    return data
+
+
+cpdef _inspect_function_pointer(str name):
+    global func_ptrs
+    if func_ptrs is None:
+        func_ptrs = _inspect_function_pointers()
+    return func_ptrs[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef const char* _nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil:
+    global __nvFatbinGetErrorString
+    _check_or_init_nvfatbin()
+    if __nvFatbinGetErrorString == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinGetErrorString is not found")
+    return (<const char* (*)(nvFatbinResult) noexcept nogil>__nvFatbinGetErrorString)(
+        result)
+
+
+cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinCreate
+    _check_or_init_nvfatbin()
+    if __nvFatbinCreate == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinCreate is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle*, const char**, size_t) noexcept nogil>__nvFatbinCreate)(
+        handle_indirect, options, optionsCount)
+
+
+cdef nvFatbinResult _nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinDestroy
+    _check_or_init_nvfatbin()
+    if __nvFatbinDestroy == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinDestroy is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle*) noexcept nogil>__nvFatbinDestroy)(
+        handle_indirect)
+
+
+cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddPTX
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddPTX == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddPTX is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const char*, size_t, const char*, const char*, const char*) noexcept nogil>__nvFatbinAddPTX)(
+        handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult _nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddCubin
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddCubin == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddCubin is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddCubin)(
+        handle, code, size, arch, identifier)
+
+
+cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddLTOIR
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddLTOIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddLTOIR is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*, const char*) noexcept nogil>__nvFatbinAddLTOIR)(
+        handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinSize
+    _check_or_init_nvfatbin()
+    if __nvFatbinSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinSize is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, size_t*) noexcept nogil>__nvFatbinSize)(
+        handle, size)
+
+
+cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinGet
+    _check_or_init_nvfatbin()
+    if __nvFatbinGet == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinGet is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, void*) noexcept nogil>__nvFatbinGet)(
+        handle, buffer)
+
+
+cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinVersion
+    _check_or_init_nvfatbin()
+    if __nvFatbinVersion == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinVersion is not found")
+    return (<nvFatbinResult (*)(unsigned int*, unsigned int*) noexcept nogil>__nvFatbinVersion)(
+        major, minor)
+
+
+cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddReloc
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddReloc == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddReloc is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t) noexcept nogil>__nvFatbinAddReloc)(
+        handle, code, size)
+
+
+cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddTileIR
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddTileIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddTileIR is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddTileIR)(
+        handle, code, size, identifier, optionsCmdLine)
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
new file mode 100644
index 0000000000..b0b3f94e5a
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
@@ -0,0 +1,315 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+import threading
+from .utils import FunctionNotFoundError, NotSupportedError
+
+from cuda.pathfinder import load_nvidia_dynamic_lib
+
+from libc.stddef cimport wchar_t
+from libc.stdint cimport uintptr_t
+from cpython cimport PyUnicode_AsWideCharString, PyMem_Free
+
+# You must 'from .utils import NotSupportedError' before using this template
+
+cdef extern from "windows.h" nogil:
+    ctypedef void* HMODULE
+    ctypedef void* HANDLE
+    ctypedef void* FARPROC
+    ctypedef unsigned long DWORD
+    ctypedef const wchar_t *LPCWSTR
+    ctypedef const char *LPCSTR
+
+    cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+    cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
+    cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
+
+    HMODULE _LoadLibraryExW "LoadLibraryExW"(
+        LPCWSTR lpLibFileName,
+        HANDLE hFile,
+        DWORD dwFlags
+    )
+
+    FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName)
+
+cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags):
+    cdef uintptr_t result
+    cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL)
+    with nogil:
+        result = <uintptr_t>_LoadLibraryExW(
+            wpath,
+            hFile,
+            dwFlags
+        )
+    PyMem_Free(wpath)
+    return result
+
+cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil:
+    return _GetProcAddress(<HMODULE>hModule, lpProcName)
+
+cdef int get_cuda_version():
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32)
+    if handle == 0:
+        raise NotSupportedError('CUDA driver is not found')
+    cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion')
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('Did not find cuDriverGetVersion symbol in nvcuda.dll')
+    err = (<int (*)(int*) noexcept nogil>cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError(f'cuDriverGetVersion returned error code {err}')
+
+    return driver_ver
+
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef object __symbol_lock = threading.Lock()
+cdef bint __py_nvfatbin_init = False
+
+cdef void* __nvFatbinGetErrorString = NULL
+cdef void* __nvFatbinCreate = NULL
+cdef void* __nvFatbinDestroy = NULL
+cdef void* __nvFatbinAddPTX = NULL
+cdef void* __nvFatbinAddCubin = NULL
+cdef void* __nvFatbinAddLTOIR = NULL
+cdef void* __nvFatbinSize = NULL
+cdef void* __nvFatbinGet = NULL
+cdef void* __nvFatbinVersion = NULL
+cdef void* __nvFatbinAddReloc = NULL
+cdef void* __nvFatbinAddTileIR = NULL
+
+
+cdef int _init_nvfatbin() except -1 nogil:
+    global __py_nvfatbin_init
+
+    with gil, __symbol_lock:
+        # Recheck the flag after obtaining the locks
+        if __py_nvfatbin_init:
+            return 0
+
+        # Load library
+        handle = load_nvidia_dynamic_lib("nvfatbin")._handle_uint
+
+        # Load function
+        global __nvFatbinGetErrorString
+        __nvFatbinGetErrorString = GetProcAddress(handle, 'nvFatbinGetErrorString')
+
+        global __nvFatbinCreate
+        __nvFatbinCreate = GetProcAddress(handle, 'nvFatbinCreate')
+
+        global __nvFatbinDestroy
+        __nvFatbinDestroy = GetProcAddress(handle, 'nvFatbinDestroy')
+
+        global __nvFatbinAddPTX
+        __nvFatbinAddPTX = GetProcAddress(handle, 'nvFatbinAddPTX')
+
+        global __nvFatbinAddCubin
+        __nvFatbinAddCubin = GetProcAddress(handle, 'nvFatbinAddCubin')
+
+        global __nvFatbinAddLTOIR
+        __nvFatbinAddLTOIR = GetProcAddress(handle, 'nvFatbinAddLTOIR')
+
+        global __nvFatbinSize
+        __nvFatbinSize = GetProcAddress(handle, 'nvFatbinSize')
+
+        global __nvFatbinGet
+        __nvFatbinGet = GetProcAddress(handle, 'nvFatbinGet')
+
+        global __nvFatbinVersion
+        __nvFatbinVersion = GetProcAddress(handle, 'nvFatbinVersion')
+
+        global __nvFatbinAddReloc
+        __nvFatbinAddReloc = GetProcAddress(handle, 'nvFatbinAddReloc')
+
+        global __nvFatbinAddTileIR
+        __nvFatbinAddTileIR = GetProcAddress(handle, 'nvFatbinAddTileIR')
+
+        __py_nvfatbin_init = True
+        return 0
+
+
+cdef inline int _check_or_init_nvfatbin() except -1 nogil:
+    if __py_nvfatbin_init:
+        return 0
+
+    return _init_nvfatbin()
+
+
+cdef dict func_ptrs = None
+
+
+cpdef dict _inspect_function_pointers():
+    global func_ptrs
+    if func_ptrs is not None:
+        return func_ptrs
+
+    _check_or_init_nvfatbin()
+    cdef dict data = {}
+
+    global __nvFatbinGetErrorString
+    data["__nvFatbinGetErrorString"] = <intptr_t>__nvFatbinGetErrorString
+
+    global __nvFatbinCreate
+    data["__nvFatbinCreate"] = <intptr_t>__nvFatbinCreate
+
+    global __nvFatbinDestroy
+    data["__nvFatbinDestroy"] = <intptr_t>__nvFatbinDestroy
+
+    global __nvFatbinAddPTX
+    data["__nvFatbinAddPTX"] = <intptr_t>__nvFatbinAddPTX
+
+    global __nvFatbinAddCubin
+    data["__nvFatbinAddCubin"] = <intptr_t>__nvFatbinAddCubin
+
+    global __nvFatbinAddLTOIR
+    data["__nvFatbinAddLTOIR"] = <intptr_t>__nvFatbinAddLTOIR
+
+    global __nvFatbinSize
+    data["__nvFatbinSize"] = <intptr_t>__nvFatbinSize
+
+    global __nvFatbinGet
+    data["__nvFatbinGet"] = <intptr_t>__nvFatbinGet
+
+    global __nvFatbinVersion
+    data["__nvFatbinVersion"] = <intptr_t>__nvFatbinVersion
+
+    global __nvFatbinAddReloc
+    data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
+
+    global __nvFatbinAddTileIR
+    data["__nvFatbinAddTileIR"] = <intptr_t>__nvFatbinAddTileIR
+
+    func_ptrs = data
+    return data
+
+
+cpdef _inspect_function_pointer(str name):
+    global func_ptrs
+    if func_ptrs is None:
+        func_ptrs = _inspect_function_pointers()
+    return func_ptrs[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef const char* _nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil:
+    global __nvFatbinGetErrorString
+    _check_or_init_nvfatbin()
+    if __nvFatbinGetErrorString == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinGetErrorString is not found")
+    return (<const char* (*)(nvFatbinResult) noexcept nogil>__nvFatbinGetErrorString)(
+        result)
+
+
+cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinCreate
+    _check_or_init_nvfatbin()
+    if __nvFatbinCreate == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinCreate is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle*, const char**, size_t) noexcept nogil>__nvFatbinCreate)(
+        handle_indirect, options, optionsCount)
+
+
+cdef nvFatbinResult _nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinDestroy
+    _check_or_init_nvfatbin()
+    if __nvFatbinDestroy == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinDestroy is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle*) noexcept nogil>__nvFatbinDestroy)(
+        handle_indirect)
+
+
+cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddPTX
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddPTX == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddPTX is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const char*, size_t, const char*, const char*, const char*) noexcept nogil>__nvFatbinAddPTX)(
+        handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult _nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddCubin
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddCubin == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddCubin is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddCubin)(
+        handle, code, size, arch, identifier)
+
+
+cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddLTOIR
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddLTOIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddLTOIR is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*, const char*) noexcept nogil>__nvFatbinAddLTOIR)(
+        handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinSize
+    _check_or_init_nvfatbin()
+    if __nvFatbinSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinSize is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, size_t*) noexcept nogil>__nvFatbinSize)(
+        handle, size)
+
+
+cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinGet
+    _check_or_init_nvfatbin()
+    if __nvFatbinGet == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinGet is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, void*) noexcept nogil>__nvFatbinGet)(
+        handle, buffer)
+
+
+cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinVersion
+    _check_or_init_nvfatbin()
+    if __nvFatbinVersion == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinVersion is not found")
+    return (<nvFatbinResult (*)(unsigned int*, unsigned int*) noexcept nogil>__nvFatbinVersion)(
+        major, minor)
+
+
+cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddReloc
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddReloc == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddReloc is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t) noexcept nogil>__nvFatbinAddReloc)(
+        handle, code, size)
+
+
+cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddTileIR
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddTileIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddTileIR is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddTileIR)(
+        handle, code, size, identifier, optionsCmdLine)
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pxd b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
new file mode 100644
index 0000000000..5969fafee1
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t, uint32_t
+
+
+###############################################################################
+# Types (structs, enums, ...)
+###############################################################################
+
+# enums
+ctypedef enum nvFatbinResult "nvFatbinResult":
+    NVFATBIN_SUCCESS "NVFATBIN_SUCCESS" = 0
+    NVFATBIN_ERROR_INTERNAL "NVFATBIN_ERROR_INTERNAL"
+    NVFATBIN_ERROR_ELF_ARCH_MISMATCH "NVFATBIN_ERROR_ELF_ARCH_MISMATCH"
+    NVFATBIN_ERROR_ELF_SIZE_MISMATCH "NVFATBIN_ERROR_ELF_SIZE_MISMATCH"
+    NVFATBIN_ERROR_MISSING_PTX_VERSION "NVFATBIN_ERROR_MISSING_PTX_VERSION"
+    NVFATBIN_ERROR_NULL_POINTER "NVFATBIN_ERROR_NULL_POINTER"
+    NVFATBIN_ERROR_COMPRESSION_FAILED "NVFATBIN_ERROR_COMPRESSION_FAILED"
+    NVFATBIN_ERROR_COMPRESSED_SIZE_EXCEEDED "NVFATBIN_ERROR_COMPRESSED_SIZE_EXCEEDED"
+    NVFATBIN_ERROR_UNRECOGNIZED_OPTION "NVFATBIN_ERROR_UNRECOGNIZED_OPTION"
+    NVFATBIN_ERROR_INVALID_ARCH "NVFATBIN_ERROR_INVALID_ARCH"
+    NVFATBIN_ERROR_INVALID_NVVM "NVFATBIN_ERROR_INVALID_NVVM"
+    NVFATBIN_ERROR_EMPTY_INPUT "NVFATBIN_ERROR_EMPTY_INPUT"
+    NVFATBIN_ERROR_MISSING_PTX_ARCH "NVFATBIN_ERROR_MISSING_PTX_ARCH"
+    NVFATBIN_ERROR_PTX_ARCH_MISMATCH "NVFATBIN_ERROR_PTX_ARCH_MISMATCH"
+    NVFATBIN_ERROR_MISSING_FATBIN "NVFATBIN_ERROR_MISSING_FATBIN"
+    NVFATBIN_ERROR_INVALID_INDEX "NVFATBIN_ERROR_INVALID_INDEX"
+    NVFATBIN_ERROR_IDENTIFIER_REUSE "NVFATBIN_ERROR_IDENTIFIER_REUSE"
+    NVFATBIN_ERROR_INTERNAL_PTX_OPTION "NVFATBIN_ERROR_INTERNAL_PTX_OPTION"
+    _NVFATBINRESULT_INTERNAL_LOADING_ERROR "_NVFATBINRESULT_INTERNAL_LOADING_ERROR" = -42
+
+
+# types
+ctypedef void* nvFatbinHandle 'nvFatbinHandle'
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cdef const char* nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil
+cdef nvFatbinResult nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pyx b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
new file mode 100644
index 0000000000..f0f8300cb0
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
+
+from ._internal cimport nvfatbin as _nvfatbin
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef const char* nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil:
+    return _nvfatbin._nvFatbinGetErrorString(result)
+
+
+cdef nvFatbinResult nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinCreate(handle_indirect, options, optionsCount)
+
+
+cdef nvFatbinResult nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinDestroy(handle_indirect)
+
+
+cdef nvFatbinResult nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddPTX(handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddCubin(handle, code, size, arch, identifier)
+
+
+cdef nvFatbinResult nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddLTOIR(handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinSize(handle, size)
+
+
+cdef nvFatbinResult nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinGet(handle, buffer)
+
+
+cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinVersion(major, minor)
+
+
+cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddReloc(handle, code, size)
+
+
+cdef nvFatbinResult nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddTileIR(handle, code, size, identifier, optionsCmdLine)
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pxd b/cuda_bindings/cuda/bindings/nvfatbin.pxd
new file mode 100644
index 0000000000..e0744efbd7
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pxd
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t, uint32_t
+
+from .cynvfatbin cimport *
+
+
+###############################################################################
+# Types
+###############################################################################
+
+ctypedef nvFatbinHandle Handle
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+ctypedef nvFatbinResult _Result
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cpdef intptr_t create(options, size_t options_count) except -1
+cpdef add_ptx(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line)
+cpdef add_cubin(intptr_t handle, code, size_t size, arch, identifier)
+cpdef add_ltoir(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line)
+cpdef size_t size(intptr_t handle) except? 0
+cpdef get(intptr_t handle, buffer)
+cpdef tuple version()
+cpdef add_reloc(intptr_t handle, code, size_t size)
+cpdef add_tile_ir(intptr_t handle, code, size_t size, identifier, options_cmd_line)
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
new file mode 100644
index 0000000000..32cdeb300b
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -0,0 +1,309 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
+
+cimport cython  # NOQA
+
+from ._internal.utils cimport (get_resource_ptr, get_nested_resource_ptr, nested_resource, nullable_unique_ptr,
+                               get_buffer_pointer, get_resource_ptrs)
+
+from enum import IntEnum as _IntEnum
+from libcpp.vector cimport vector
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+class Result(_IntEnum):
+    """See `nvFatbinResult`."""
+    SUCCESS = NVFATBIN_SUCCESS
+    ERROR_INTERNAL = NVFATBIN_ERROR_INTERNAL
+    ERROR_ELF_ARCH_MISMATCH = NVFATBIN_ERROR_ELF_ARCH_MISMATCH
+    ERROR_ELF_SIZE_MISMATCH = NVFATBIN_ERROR_ELF_SIZE_MISMATCH
+    ERROR_MISSING_PTX_VERSION = NVFATBIN_ERROR_MISSING_PTX_VERSION
+    ERROR_NULL_POINTER = NVFATBIN_ERROR_NULL_POINTER
+    ERROR_COMPRESSION_FAILED = NVFATBIN_ERROR_COMPRESSION_FAILED
+    ERROR_COMPRESSED_SIZE_EXCEEDED = NVFATBIN_ERROR_COMPRESSED_SIZE_EXCEEDED
+    ERROR_UNRECOGNIZED_OPTION = NVFATBIN_ERROR_UNRECOGNIZED_OPTION
+    ERROR_INVALID_ARCH = NVFATBIN_ERROR_INVALID_ARCH
+    ERROR_INVALID_NVVM = NVFATBIN_ERROR_INVALID_NVVM
+    ERROR_EMPTY_INPUT = NVFATBIN_ERROR_EMPTY_INPUT
+    ERROR_MISSING_PTX_ARCH = NVFATBIN_ERROR_MISSING_PTX_ARCH
+    ERROR_PTX_ARCH_MISMATCH = NVFATBIN_ERROR_PTX_ARCH_MISMATCH
+    ERROR_MISSING_FATBIN = NVFATBIN_ERROR_MISSING_FATBIN
+    ERROR_INVALID_INDEX = NVFATBIN_ERROR_INVALID_INDEX
+    ERROR_IDENTIFIER_REUSE = NVFATBIN_ERROR_IDENTIFIER_REUSE
+    ERROR_INTERNAL_PTX_OPTION = NVFATBIN_ERROR_INTERNAL_PTX_OPTION
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+class nvFatbinError(Exception):
+
+    def __init__(self, status):
+        self.status = status
+        s = Result(status)
+        cdef str err = f"{s.name} ({s.value})"
+        super(nvFatbinError, self).__init__(err)
+
+    def __reduce__(self):
+        return (type(self), (self.status,))
+
+
+@cython.profile(False)
+cdef int check_status(int status) except 1 nogil:
+    if status != 0:
+        with gil:
+            raise nvFatbinError(status)
+    return status
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cpdef destroy(intptr_t handle):
+    """nvFatbinDestroy frees the memory associated with the given handle.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+
+    .. seealso:: `nvFatbinDestroy`
+    """
+    cdef Handle h = <Handle>handle
+    with nogil:
+        status = nvFatbinDestroy(&h)
+    check_status(status)
+
+
+cpdef str get_error_string(int result):
+    """nvFatbinGetErrorString returns an error description string for each error code.
+
+    Args:
+        result (Result): error code.
+
+    .. seealso:: `nvFatbinGetErrorString`
+    """
+    cdef const char* _output_
+    cdef bytes _output_bytes_
+    _output_ = nvFatbinGetErrorString(<_Result>result)
+
+    if _output_ == NULL:
+        return ""
+
+    _output_bytes_ = <bytes>_output_
+    return _output_bytes_.decode()
+
+
+cpdef intptr_t create(options, size_t options_count) except -1:
+    """nvFatbinCreate creates a new handle.
+
+    Args:
+        options (object): An array of strings, each containing a single option. It can be:
+
+            - an :class:`int` as the pointer address to the nested sequence, or
+            - a Python sequence of :class:`int`\s, each of which is a pointer address
+              to a valid sequence of 'char', or
+            - a nested Python sequence of ``str``.
+
+        options_count (size_t): Number of options.
+
+    Returns:
+        intptr_t: Address of nvFatbin handle.
+
+    .. seealso:: `nvFatbinCreate`
+    """
+    cdef nested_resource[ char ] _options_
+    get_nested_resource_ptr[char](_options_, options, <char*>NULL)
+    cdef Handle handle_indirect
+    with nogil:
+        __status__ = nvFatbinCreate(&handle_indirect, <const char**>(_options_.ptrs.data()), options_count)
+    check_status(__status__)
+    return <intptr_t>handle_indirect
+
+
+cpdef add_ptx(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line):
+    """nvFatbinAddPTX adds PTX to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The PTX code.
+        size (size_t): The size of the PTX code.
+        arch (str): The numerical architecture that this PTX is for (the XX of any sm_XX, lto_XX, or compute_XX).
+        identifier (str): Name of the PTX, useful when extracting the fatbin with tools like cuobjdump.
+        options_cmd_line (str): Options used during JIT compilation.
+
+    .. seealso:: `nvFatbinAddPTX`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    if not isinstance(arch, str):
+        raise TypeError("arch must be a Python str")
+    cdef bytes _temp_arch_ = (<str>arch).encode()
+    cdef char* _arch_ = _temp_arch_
+    if not isinstance(identifier, str):
+        raise TypeError("identifier must be a Python str")
+    cdef bytes _temp_identifier_ = (<str>identifier).encode()
+    cdef char* _identifier_ = _temp_identifier_
+    if not isinstance(options_cmd_line, str):
+        raise TypeError("options_cmd_line must be a Python str")
+    cdef bytes _temp_options_cmd_line_ = (<str>options_cmd_line).encode()
+    cdef char* _options_cmd_line_ = _temp_options_cmd_line_
+    with nogil:
+        __status__ = nvFatbinAddPTX(<Handle>handle, <const char*>_code_, size, <const char*>_arch_, <const char*>_identifier_, <const char*>_options_cmd_line_)
+    check_status(__status__)
+
+
+cpdef add_cubin(intptr_t handle, code, size_t size, arch, identifier):
+    """nvFatbinAddCubin adds a CUDA binary to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The cubin.
+        size (size_t): The size of the cubin.
+        arch (str): The numerical architecture that this cubin is for (the XX of any sm_XX, lto_XX, or compute_XX).
+        identifier (str): Name of the cubin, useful when extracting the fatbin with tools like cuobjdump.
+
+    .. seealso:: `nvFatbinAddCubin`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    if not isinstance(arch, str):
+        raise TypeError("arch must be a Python str")
+    cdef bytes _temp_arch_ = (<str>arch).encode()
+    cdef char* _arch_ = _temp_arch_
+    if not isinstance(identifier, str):
+        raise TypeError("identifier must be a Python str")
+    cdef bytes _temp_identifier_ = (<str>identifier).encode()
+    cdef char* _identifier_ = _temp_identifier_
+    with nogil:
+        __status__ = nvFatbinAddCubin(<Handle>handle, <const void*>_code_, size, <const char*>_arch_, <const char*>_identifier_)
+    check_status(__status__)
+
+
+cpdef add_ltoir(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line):
+    """nvFatbinAddLTOIR adds LTOIR to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The LTOIR code.
+        size (size_t): The size of the LTOIR code.
+        arch (str): The numerical architecture that this LTOIR is for (the XX of any sm_XX, lto_XX, or compute_XX).
+        identifier (str): Name of the LTOIR, useful when extracting the fatbin with tools like cuobjdump.
+        options_cmd_line (str): Options used during JIT compilation.
+
+    .. seealso:: `nvFatbinAddLTOIR`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    if not isinstance(arch, str):
+        raise TypeError("arch must be a Python str")
+    cdef bytes _temp_arch_ = (<str>arch).encode()
+    cdef char* _arch_ = _temp_arch_
+    if not isinstance(identifier, str):
+        raise TypeError("identifier must be a Python str")
+    cdef bytes _temp_identifier_ = (<str>identifier).encode()
+    cdef char* _identifier_ = _temp_identifier_
+    if not isinstance(options_cmd_line, str):
+        raise TypeError("options_cmd_line must be a Python str")
+    cdef bytes _temp_options_cmd_line_ = (<str>options_cmd_line).encode()
+    cdef char* _options_cmd_line_ = _temp_options_cmd_line_
+    with nogil:
+        __status__ = nvFatbinAddLTOIR(<Handle>handle, <const void*>_code_, size, <const char*>_arch_, <const char*>_identifier_, <const char*>_options_cmd_line_)
+    check_status(__status__)
+
+
+cpdef size_t size(intptr_t handle) except? 0:
+    """nvFatbinSize returns the fatbinary's size.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+
+    Returns:
+        size_t: The fatbinary's size.
+
+    .. seealso:: `nvFatbinSize`
+    """
+    cdef size_t size
+    with nogil:
+        __status__ = nvFatbinSize(<Handle>handle, &size)
+    check_status(__status__)
+    return size
+
+
+cpdef get(intptr_t handle, buffer):
+    """nvFatbinGet returns the completed fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        buffer (bytes): memory to store fatbinary.
+
+    .. seealso:: `nvFatbinGet`
+    """
+    cdef void* _buffer_ = get_buffer_pointer(buffer, -1, readonly=False)
+    with nogil:
+        __status__ = nvFatbinGet(<Handle>handle, <void*>_buffer_)
+    check_status(__status__)
+
+
+cpdef tuple version():
+    """nvFatbinVersion returns the current version of nvFatbin.
+
+    Returns:
+        A 2-tuple containing:
+
+        - unsigned int: The major version.
+        - unsigned int: The minor version.
+
+    .. seealso:: `nvFatbinVersion`
+    """
+    cdef unsigned int major
+    cdef unsigned int minor
+    with nogil:
+        __status__ = nvFatbinVersion(&major, &minor)
+    check_status(__status__)
+    return (major, minor)
+
+
+cpdef add_reloc(intptr_t handle, code, size_t size):
+    """nvFatbinAddReloc adds relocatable PTX entries from a host object to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The host object image.
+        size (size_t): The size of the host object image code.
+
+    .. seealso:: `nvFatbinAddReloc`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    with nogil:
+        __status__ = nvFatbinAddReloc(<Handle>handle, <const void*>_code_, size)
+    check_status(__status__)
+
+
+cpdef add_tile_ir(intptr_t handle, code, size_t size, identifier, options_cmd_line):
+    """nvFatbinAddTileIR adds Tile IR to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The Tile IR.
+        size (size_t): The size of the Tile IR.
+        identifier (str): Name of the Tile IR, useful when extracting the fatbin with tools like cuobjdump.
+        options_cmd_line (str): Options used during JIT compilation.
+
+    .. seealso:: `nvFatbinAddTileIR`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    if not isinstance(identifier, str):
+        raise TypeError("identifier must be a Python str")
+    cdef bytes _temp_identifier_ = (<str>identifier).encode()
+    cdef char* _identifier_ = _temp_identifier_
+    if not isinstance(options_cmd_line, str):
+        raise TypeError("options_cmd_line must be a Python str")
+    cdef bytes _temp_options_cmd_line_ = (<str>options_cmd_line).encode()
+    cdef char* _options_cmd_line_ = _temp_options_cmd_line_
+    with nogil:
+        __status__ = nvFatbinAddTileIR(<Handle>handle, <const void*>_code_, size, <const char*>_identifier_, <const char*>_options_cmd_line_)
+    check_status(__status__)
diff --git a/cuda_bindings/docs/source/api.rst b/cuda_bindings/docs/source/api.rst
index 4277bc745b..e6ee4b99dd 100644
--- a/cuda_bindings/docs/source/api.rst
+++ b/cuda_bindings/docs/source/api.rst
@@ -14,5 +14,6 @@ CUDA Python API Reference
    module/nvrtc
    module/nvjitlink
    module/nvvm
+   module/nvfatbin
    module/cufile
    module/utils
diff --git a/cuda_bindings/docs/source/module/nvfatbin.rst b/cuda_bindings/docs/source/module/nvfatbin.rst
new file mode 100644
index 0000000000..297d4baa85
--- /dev/null
+++ b/cuda_bindings/docs/source/module/nvfatbin.rst
@@ -0,0 +1,89 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+.. default-role:: cpp:any
+
+nvfatbin
+========
+
+Note
+----
+
+The nvfatbin bindings are not supported on nvFatbin installations <12.4. Ensure the installed CUDA toolkit's nvFatbin version is >=12.4.
+
+The Tile IR API (:func:`cuda.bindings.nvfatbin.add_tile_ir`) is only available in CUDA 13.1+.
+
+Functions
+---------
+
+NvFatbin defines the following functions for creating and populating fatbinaries.
+
+.. autofunction:: cuda.bindings.nvfatbin.create
+.. autofunction:: cuda.bindings.nvfatbin.destroy
+.. autofunction:: cuda.bindings.nvfatbin.add_ptx
+.. autofunction:: cuda.bindings.nvfatbin.add_cubin
+.. autofunction:: cuda.bindings.nvfatbin.add_ltoir
+.. autofunction:: cuda.bindings.nvfatbin.add_reloc
+.. autofunction:: cuda.bindings.nvfatbin.add_tile_ir
+.. autofunction:: cuda.bindings.nvfatbin.size
+.. autofunction:: cuda.bindings.nvfatbin.get
+.. autofunction:: cuda.bindings.nvfatbin.get_error_string
+.. autofunction:: cuda.bindings.nvfatbin.version
+
+Types
+---------
+.. autoclass:: cuda.bindings.nvfatbin.Result
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.SUCCESS
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INTERNAL
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_ELF_ARCH_MISMATCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_ELF_SIZE_MISMATCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_MISSING_PTX_VERSION
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_NULL_POINTER
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_COMPRESSION_FAILED
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_COMPRESSED_SIZE_EXCEEDED
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_UNRECOGNIZED_OPTION
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INVALID_ARCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INVALID_NVVM
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_EMPTY_INPUT
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_MISSING_PTX_ARCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_PTX_ARCH_MISMATCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_MISSING_FATBIN
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INVALID_INDEX
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_IDENTIFIER_REUSE
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INTERNAL_PTX_OPTION
+
diff --git a/cuda_bindings/docs/source/release/13.1.X-notes.rst b/cuda_bindings/docs/source/release/13.1.X-notes.rst
index 21323682b1..92a3f8dbcb 100644
--- a/cuda_bindings/docs/source/release/13.1.X-notes.rst
+++ b/cuda_bindings/docs/source/release/13.1.X-notes.rst
@@ -9,6 +9,8 @@
 Highlights
 ----------
 
+* Add ``nvFatbin`` bindings. (PR #1467 <https://github.com/NVIDIA/cuda-python/pull/1467>_)
+
 Experimental
 ------------
 
diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml
index 7c4bddb434..614f7bb63a 100644
--- a/cuda_bindings/pyproject.toml
+++ b/cuda_bindings/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = ["cuda-pathfinder ~=1.1"]
 
 [project.optional-dependencies]
 all = [
-    "cuda-toolkit[nvrtc,nvjitlink,nvvm]==13.*",
+    "cuda-toolkit[nvrtc,nvjitlink,nvvm,nvfatbin]==13.*",
     "cuda-toolkit[cufile]==13.*; sys_platform == 'linux'",
 ]
 
diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
new file mode 100644
index 0000000000..c3b25db2f2
--- /dev/null
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -0,0 +1,321 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+
+import base64
+import binascii
+import shutil
+import subprocess
+
+import pytest
+from cuda.bindings import nvfatbin, nvrtc
+
+ARCHITECTURES = ["sm_75", "sm_80", "sm_90", "sm_100"]
+PTX_VERSIONS = ["6.4", "7.0", "8.5", "8.8"]
+
+PTX_TEMPLATE = """
+.version {PTX_VERSION}
+.target {ARCH}
+.address_size 64
+
+        // .globl       _Z6kernelPi
+
+.visible .entry _Z6kernelPi(
+        .param .u64 _Z6kernelPi_param_0
+)
+{{
+        .reg .b32       %r<7>;
+        .reg .b64       %rd<5>;
+
+
+        ld.param.u64    %rd1, [_Z6kernelPi_param_0];
+        cvta.to.global.u64      %rd2, %rd1;
+        mov.u32         %r1, %tid.x;
+        mov.u32         %r2, %ctaid.x;
+        mov.u32         %r3, %ntid.x;
+        mad.lo.s32      %r4, %r2, %r3, %r1;
+        mul.wide.s32    %rd3, %r4, 4;
+        add.s64         %rd4, %rd2, %rd3;
+        ld.global.u32   %r5, [%rd4];
+        add.s32         %r6, %r5, 1;
+        st.global.u32   [%rd4], %r6;
+        ret;
+
+}}
+"""
+
+CODE = """
+int __device__ inc(int x) {
+    return x + 1;
+}
+"""
+
+# Base64 encoded TileIR generated by the toolshed/dump_cutile_b64.py script.
+TILEIR_b64 = (
+    "f1RpbGVJUgANAQAAgo0BCAECBgYBCwEECgCBAUQHBgQIEAAABgUMAQABBgUIEAALQwEICgEMAAYE"
+    "CBAAAwYFDAEABAYFCBAAD0MBCA4BEAAGBAgQAAYGBQwBAAcGBQgQABNDAQgSARQAMAUFBUIJDT4C"
+    "CgcEABkBFglCCRE+AgoHBAAcARYJAgoAABodQgkVZgEHBAAfIAEWCVwAAIQICADLy8vLy8vLg5oC"
+    "CMvLy8sBy8vLAAAAABfLy8vLy8vLBAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAA"
+    "AAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAA"
+    "AAAABAAAAAAAAAAEAAAAAAAAAAUAAAAAAAAABgAAAAAAAAAGAAAAAAAAAAcAAAAAAAAABwAAAAAA"
+    "AAAIAAAAAAAAAAkAAAAAAAAACQAAAAAAAAAEAAAAAAAAAAnLy8sAAAAAAwAAAAUAAAAMAAAAEQAA"
+    "ABYAAAAbAAAAIAAAACUAAAACAAEBAQUBFwICAhcEAwMYBAQDAxkTBAMDGhEEAwMbEQQDAx0WBAMD"
+    "HgiFdATLy8sLy8vLAAAAAAEAAAACAAAAAwAAAAUAAAAIAAAACwAAABcAAAAYAAAALAAAADkAAAAA"
+    "AwcMAg0DAA0BABAJBAUFBAUFBAUFABEOAgEAAAAAAAAAgAEBAAAAAAAAAA8BEAAAAAgBAAAAAAAN"
+    "AgEQAAAAAAAAAIGIAQQFy8vLAAAAABIAAAAsAAAAPQAAAGoAAABkdW1wX2N1dGlsZV9iNjQucHkv"
+    "bG9jYWxob21lL2xvY2FsLXdhbmdtL3RveXZlY3Rvcl9hZGRfa2VybmVsL2xvY2FsaG9tZS9sb2Nh"
+    "bC13YW5nbS90b3kvZHVtcF9jdXRpbGVfYjY0LnB5c21fMTIwAA=="
+)
+
+
+def get_version() -> tuple[int, int]:
+    return nvfatbin.version()
+
+
+@pytest.fixture(params=ARCHITECTURES)
+def arch(request):
+    return request.param
+
+
+@pytest.fixture(params=PTX_VERSIONS)
+def ptx_version(request):
+    return request.param
+
+
+@pytest.fixture
+def PTX(arch, ptx_version):
+    return PTX_TEMPLATE.format(PTX_VERSION=ptx_version, ARCH=arch)
+
+
+@pytest.fixture
+def nvcc_smoke(tmpdir) -> str:
+    # TODO: Use cuda-pathfinder to locate nvcc on system.
+    nvcc = shutil.which("nvcc")
+    if nvcc is None:
+        pytest.skip("nvcc not found on PATH")
+
+    # Smoke test: make sure nvcc is actually usable (toolkit + host compiler are set up),
+    # not merely present on PATH.
+    src = tmpdir / "nvcc_smoke.cu"
+    out = tmpdir / "nvcc_smoke.o"
+    with open(src, "w") as f:
+        f.write("")
+    try:
+        subprocess.run(  # noqa: S603
+            [nvcc, "-c", str(src), "-o", str(out)],
+            check=True,
+            capture_output=True,
+        )
+    except subprocess.CalledProcessError as e:
+        stdout = (e.stdout or b"").decode(errors="replace")
+        stderr = (e.stderr or b"").decode(errors="replace")
+        pytest.skip(
+            "nvcc found on PATH but failed to compile a trivial input.\n"
+            f"command: {[nvcc, '-c', str(src), '-o', str(out)]!r}\n"
+            f"exit_code: {e.returncode}\n"
+            f"stdout:\n{stdout}\n"
+            f"stderr:\n{stderr}\n"
+        )
+
+    return nvcc
+
+
+@pytest.fixture
+def CUBIN(arch):
+    def CHECK_NVRTC(err):
+        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+            raise RuntimeError(repr(err))
+
+    err, program_handle = nvrtc.nvrtcCreateProgram(CODE.encode(), b"", 0, [], [])
+    CHECK_NVRTC(err)
+    err = nvrtc.nvrtcCompileProgram(program_handle, 1, [f"-arch={arch}".encode()])[0]
+    CHECK_NVRTC(err)
+    err, size = nvrtc.nvrtcGetCUBINSize(program_handle)
+    CHECK_NVRTC(err)
+    cubin = b" " * size
+    (err,) = nvrtc.nvrtcGetCUBIN(program_handle, cubin)
+    CHECK_NVRTC(err)
+    (err,) = nvrtc.nvrtcDestroyProgram(program_handle)
+    CHECK_NVRTC(err)
+    return cubin
+
+
+# create a valid LTOIR input for testing
+@pytest.fixture
+def LTOIR(arch):
+    arch = arch.replace("sm", "compute")
+
+    def CHECK_NVRTC(err):
+        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+            raise RuntimeError(repr(err))
+
+    empty_cplusplus_kernel = "__global__ void A() {}"
+    err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], [])
+    CHECK_NVRTC(err)
+    err = nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto", f"-arch={arch}".encode()])[0]
+    CHECK_NVRTC(err)
+    err, size = nvrtc.nvrtcGetLTOIRSize(program_handle)
+    CHECK_NVRTC(err)
+    empty_kernel_ltoir = b" " * size
+    (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir)
+    CHECK_NVRTC(err)
+    (err,) = nvrtc.nvrtcDestroyProgram(program_handle)
+    CHECK_NVRTC(err)
+    return empty_kernel_ltoir
+
+
+@pytest.fixture
+def OBJECT(arch, tmpdir, nvcc_smoke):
+    empty_cplusplus_kernel = "__global__ void A() {}"
+    with open(tmpdir / "object.cu", "w") as f:
+        f.write(empty_cplusplus_kernel)
+
+    nvcc = nvcc_smoke
+
+    # This is a test fixture that intentionally invokes a trusted tool (`nvcc`) to
+    # compile a temporary CUDA translation unit.
+    cmd = [nvcc, "-c", "-arch", arch, "-o", str(tmpdir / "object.o"), str(tmpdir / "object.cu")]
+    try:
+        subprocess.run(  # noqa: S603
+            cmd,
+            check=True,
+            capture_output=True,
+        )
+    except subprocess.CalledProcessError as e:
+        stdout = (e.stdout or b"").decode(errors="replace")
+        stderr = (e.stderr or b"").decode(errors="replace")
+        raise RuntimeError(
+            "nvcc smoke test passed, but nvcc failed while compiling the test object.\n"
+            f"command: {cmd!r}\n"
+            f"exit_code: {e.returncode}\n"
+            f"stdout:\n{stdout}\n"
+            f"stderr:\n{stderr}\n"
+        ) from e
+    with open(tmpdir / "object.o", "rb") as f:
+        object = f.read()
+
+    return object
+
+
+@pytest.fixture
+def TILEIR(tmpdir):
+    try:
+        binary_data = base64.b64decode(TILEIR_b64)
+    except binascii.Error as e:
+        raise ValueError(
+            "Base64 encoded TileIR is corrupted. Please regenerate the TileIR"
+            "by executing the toolshed/dump_cutile_b64.py script."
+        ) from e
+    return binary_data
+
+
+@pytest.mark.parametrize("error_enum", nvfatbin.Result)
+def test_get_error_string(error_enum):
+    es = nvfatbin.get_error_string(error_enum)
+
+    if error_enum is nvfatbin.Result.SUCCESS:
+        assert es == ""
+    else:
+        assert es != ""
+
+
+def test_nvfatbin_get_version():
+    major, minor = nvfatbin.version()
+    assert major is not None
+    assert minor is not None
+
+
+def test_nvfatbin_empty_create_and_destroy():
+    handle = nvfatbin.create([], 0)
+    assert handle is not None
+    nvfatbin.destroy(handle)
+
+
+def test_nvfatbin_invalid_input_create():
+    with pytest.raises(nvfatbin.nvFatbinError, match="ERROR_UNRECOGNIZED_OPTION"):
+        nvfatbin.create(["--unsupported_option"], 1)
+
+
+def test_nvfatbin_get_empty():
+    handle = nvfatbin.create([], 0)
+    size = nvfatbin.size(handle)
+
+    buffer = bytearray(size)
+    nvfatbin.get(handle, buffer)
+
+    nvfatbin.destroy(handle)
+
+
+def test_nvfatbin_add_ptx(PTX, arch):
+    arch_numeric = arch.split("_")[1]
+
+    handle = nvfatbin.create([], 0)
+    nvfatbin.add_ptx(handle, PTX.encode(), len(PTX), arch_numeric, "add", f"-arch={arch}")
+
+    buffer = bytearray(nvfatbin.size(handle))
+
+    nvfatbin.get(handle, buffer)
+    nvfatbin.destroy(handle)
+
+
+@pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
+def test_nvfatbin_add_cubin_ELF_SIZE_MISMATCH(CUBIN, arch):
+    handle = nvfatbin.create([], 0)
+    with pytest.raises(nvfatbin.nvFatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
+        nvfatbin.add_cubin(handle, CUBIN, len(CUBIN), "75", "inc")
+
+    nvfatbin.destroy(handle)
+
+
+def test_nvfatbin_add_cubin(CUBIN, arch):
+    arch_numeric = arch.split("_")[1]
+
+    handle = nvfatbin.create([], 0)
+    nvfatbin.add_cubin(handle, CUBIN, len(CUBIN), arch_numeric, "inc")
+
+    buffer = bytearray(nvfatbin.size(handle))
+
+    nvfatbin.get(handle, buffer)
+    nvfatbin.destroy(handle)
+
+
+@pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
+def test_nvfatbin_add_cubin_ELF_ARCH_MISMATCH(CUBIN, arch):
+    handle = nvfatbin.create([], 0)
+    with pytest.raises(nvfatbin.nvFatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
+        nvfatbin.add_cubin(handle, CUBIN, len(CUBIN), "75", "inc")
+
+    nvfatbin.destroy(handle)
+
+
+def test_nvdfatbin_add_ltoir(LTOIR, arch):
+    arch_numeric = arch.split("_")[1]
+
+    handle = nvfatbin.create([], 0)
+    nvfatbin.add_ltoir(handle, LTOIR, len(LTOIR), arch_numeric, "inc", "")
+
+    buffer = bytearray(nvfatbin.size(handle))
+
+    nvfatbin.get(handle, buffer)
+    nvfatbin.destroy(handle)
+
+
+def test_nvfatbin_add_reloc(OBJECT):
+    handle = nvfatbin.create([], 0)
+    nvfatbin.add_reloc(handle, OBJECT, len(OBJECT))
+
+    buffer = bytearray(nvfatbin.size(handle))
+
+    nvfatbin.get(handle, buffer)
+    nvfatbin.destroy(handle)
+
+
+@pytest.mark.skipif(get_version() < (13, 1), reason="TileIR API is not supported in CUDA < 13.1")
+def test_nvfatbin_add_tile_ir(TILEIR):
+    handle = nvfatbin.create([], 0)
+    nvfatbin.add_tile_ir(handle, TILEIR, len(TILEIR), "VectorAdd", "")
+
+    buffer = bytearray(nvfatbin.size(handle))
+
+    nvfatbin.get(handle, buffer)
+    nvfatbin.destroy(handle)
diff --git a/toolshed/dump_cutile_b64.py b/toolshed/dump_cutile_b64.py
new file mode 100644
index 0000000000..4ce5a82a9f
--- /dev/null
+++ b/toolshed/dump_cutile_b64.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+"""
+Embeds a sample cuTile kernel, executes it with CUDA_TILE_DUMP_BYTECODE=.,
+loads the resulting .cutile file, and prints its base64-encoded content.
+"""
+
+import base64
+import glob
+import os
+import sys
+
+import cuda.tile as ct
+import cupy
+
+
+def _run_sample_cutile_kernel() -> None:
+    # Import after env var setup so CUDA_TILE_DUMP_BYTECODE is honored.
+    TILE_SIZE = 16
+
+    @ct.kernel
+    def vector_add_kernel(a, b, result):
+        block_id = ct.bid(0)
+        a_tile = ct.load(a, index=(block_id,), shape=(TILE_SIZE,))
+        b_tile = ct.load(b, index=(block_id,), shape=(TILE_SIZE,))
+
+        result_tile = a_tile + b_tile
+        ct.store(result, index=(block_id,), tile=result_tile)
+
+    a = cupy.arange(128, dtype="float32")
+    b = cupy.arange(128, dtype="float32")
+    result = cupy.zeros_like(a)
+
+    grid = (ct.cdiv(a.shape[0], TILE_SIZE), 1, 1)
+    ct.launch(cupy.cuda.get_current_stream(), grid, vector_add_kernel, (a, b, result))
+
+    cupy.cuda.get_current_stream().synchronize()
+
+    assert result[-1] == 254
+
+
+def main():
+    # CUDA_TILE_DUMP_BYTECODE=. means dump to current directory
+    os.environ["CUDA_TILE_DUMP_BYTECODE"] = "."
+
+    try:
+        _run_sample_cutile_kernel()
+    except Exception as e:
+        print(f"Sample kernel execution failed: {e}", file=sys.stderr)
+        raise
+
+    # Find the .cutile file in current directory
+    cutile_files = glob.glob("./*.cutile")
+    if not cutile_files:
+        print("No .cutile file found in current directory", file=sys.stderr)
+        sys.exit(1)
+
+    # Use the most recently modified one if multiple exist
+    cutile_path = max(cutile_files, key=os.path.getmtime)
+
+    # Read the binary content
+    with open(cutile_path, "rb") as f:
+        binary_content = f.read()
+
+    # Encode with base64 in ASCII mode
+    b64_encoded = base64.b64encode(binary_content).decode("ascii")
+
+    # Print with lines less than 79 characters, wrapped with quotes
+    line_width = 76  # 78 - 2 for the quotes on both sides
+    for i in range(0, len(b64_encoded), line_width):
+        chunk = b64_encoded[i : i + line_width]
+        print(f'"{chunk}"')
+
+
+if __name__ == "__main__":
+    main()