diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..9937b0283 --- /dev/null +++ b/.clang-format @@ -0,0 +1,88 @@ +--- +# BasedOnStyle: LLVM +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: false +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BreakAfterJavaFieldAnnotations: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +BreakStringLiterals: true +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +FixNamespaceComments: true +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +# Lower Priority numbers sort first (Priority 1 appears before Priority 2, etc.). +# List order matches priority order (1 → 2 → 3); torch/pybind11 must be listed before the +# broader '^<' rule so it is matched first. +# Result: torch/pybind11 (1) → system/third-party (2) → project-local headers (3). +# Add graphlearn_torch to the Priority 1 regex when GLT headers appear in the codebase. 
+IncludeCategories: + - Regex: '^<(torch|pybind11)/' + Priority: 1 + - Regex: '^(<|"gtest/)' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeIsMainRegex: '^$' +IndentCaseLabels: true +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 10000000 +PointerAlignment: Left +RawStringFormats: + - Delimiters: [pb] + Language: TextProto + BasedOnStyle: google +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: c++17 +TabWidth: 4 +UseTab: Never +... diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..5dd99dd83 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,236 @@ +--- +# -bugprone-implicit-widening-of-multiplication-result is disabled because it +# crashes clang-tidy 15 on a construct in ATen/core/dynamic_type.h (upstream +# LLVM bug). Re-enable when upgrading past clang-tidy 15. +# All other disabled checks are documented in docs/cpp_style_guide.md. 
+Checks: > + boost-use-to-string, + bugprone-*, + -bugprone-easily-swappable-parameters, + -bugprone-implicit-widening-of-multiplication-result, + -bugprone-narrowing-conversions, + cert-err34-c, + cert-flp30-c, + cert-msc32-c, + cert-msc50-cpp, + cert-msc51-cpp, + clang-diagnostic-*, + cppcoreguidelines-interfaces-global-init, + cppcoreguidelines-no-malloc, + cppcoreguidelines-pro-type-static-cast-downcast, + cppcoreguidelines-pro-type-union-access, + cppcoreguidelines-slicing, + google-build-namespaces, + google-explicit-constructor, + google-global-names-in-headers, + google-readability-casting, + google-runtime-member-string-references, + google-runtime-memset, + hicpp-exception-baseclass, + misc-*, + -misc-confusable-identifiers, + -misc-const-correctness, + -misc-no-recursion, + modernize-*, + -modernize-avoid-c-arrays, + -modernize-use-trailing-return-type, + performance-*, + readability-*, + -readability-avoid-const-params-in-decls, + -readability-function-cognitive-complexity, + -readability-container-contains, + -readability-identifier-length, + -readability-magic-numbers, + +# WarningsAsErrors and HeaderFilterRegex work together: +# HeaderFilterRegex scopes which headers clang-tidy reports diagnostics for. +# Warnings from headers outside the regex (PyTorch, pybind11, etc.) are suppressed +# entirely and never reach WarningsAsErrors — so the large warning counts printed +# by clang-tidy ("N warnings generated") are third-party noise that is silently +# dropped. Only diagnostics in our own headers (.*/gigl-core/csrc/.*) are reported, +# and those are treated as hard errors. +WarningsAsErrors: '*' +HeaderFilterRegex: '.*/gigl-core/csrc/.*' +FormatStyle: none +# CheckOptions: per-check tuning parameters. Each entry configures a specific +# option for an individual check, using the form: +# key: . +# value: +# These let you adjust thresholds, naming patterns, and behavior without +# enabling or disabling the check entirely. 
+CheckOptions: + - key: bugprone-argument-comment.StrictMode + value: '0' + - key: bugprone-assert-side-effect.AssertMacros + value: 'assert,SC_ASSERT' + - key: bugprone-assert-side-effect.CheckFunctionCalls + value: '0' + - key: bugprone-dangling-handle.HandleClasses + value: 'std::basic_string_view;std::experimental::basic_string_view' + - key: bugprone-string-constructor.LargeLengthThreshold + value: '8388608' + - key: bugprone-string-constructor.WarnOnLargeLength + value: '1' + - key: cppcoreguidelines-pro-type-member-init.IgnoreArrays + value: '1' + - key: google-global-names-in-headers.HeaderFileExtensions + value: ',h,hh,hpp,hxx' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' + - key: misc-definitions-in-headers.HeaderFileExtensions + value: ',h,hh,hpp,hxx' + - key: misc-definitions-in-headers.UseHeaderFileExtension + value: '1' + - key: misc-misplaced-widening-cast.CheckImplicitCasts + value: '0' + - key: misc-sizeof-expression.WarnOnSizeOfCompareToConstant + value: '1' + - key: misc-sizeof-expression.WarnOnSizeOfConstant + value: '1' + - key: misc-sizeof-expression.WarnOnSizeOfThis + value: '1' + - key: misc-suspicious-enum-usage.StrictMode + value: '0' + - key: misc-suspicious-missing-comma.MaxConcatenatedTokens + value: '5' + - key: misc-suspicious-missing-comma.RatioThreshold + value: '0.200000' + - key: misc-suspicious-missing-comma.SizeThreshold + value: '5' + - key: misc-suspicious-string-compare.StringCompareLikeFunctions + value: '' + - key: misc-suspicious-string-compare.WarnOnImplicitComparison + value: '1' + - key: misc-suspicious-string-compare.WarnOnLogicalNotComparison + value: '0' + - key: misc-throw-by-value-catch-by-reference.CheckThrowTemporaries + value: '1' + - key: modernize-loop-convert.MaxCopySize + value: '16' + - key: 
modernize-loop-convert.MinConfidence + value: reasonable + - key: modernize-loop-convert.NamingStyle + value: camelBack + - key: modernize-loop-convert.UseCxx20ReverseRanges + value: '0' + - key: modernize-make-unique.IgnoreMacros + value: '1' + - key: modernize-make-unique.IncludeStyle + value: 'llvm' + - key: modernize-make-unique.MakeSmartPtrFunction + value: 'std::make_unique' + - key: modernize-make-unique.MakeSmartPtrFunctionHeader + value: memory + - key: modernize-pass-by-value.IncludeStyle + value: llvm + - key: modernize-replace-auto-ptr.IncludeStyle + value: llvm + - key: modernize-use-emplace.ContainersWithPushBack + value: '::std::vector;::std::list;::std::deque' + - key: modernize-use-emplace.SmartPointers + value: '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr' + - key: modernize-use-emplace.TupleMakeFunctions + value: '::std::make_pair;::std::make_tuple' + - key: modernize-use-emplace.TupleTypes + value: '::std::pair;::std::tuple' + - key: modernize-use-noexcept.ReplacementString + value: '' + - key: modernize-use-noexcept.UseNoexceptFalse + value: '1' + - key: modernize-use-nullptr.NullMacros + value: 'NULL' + - key: modernize-use-transparent-functors.SafeMode + value: '0' + - key: performance-faster-string-find.StringLikeClasses + value: 'std::basic_string' + - key: performance-for-range-copy.WarnOnAllAutoCopies + value: '0' + - key: performance-inefficient-string-concatenation.StrictMode + value: '0' + - key: performance-inefficient-vector-operation.VectorLikeClasses + value: '::std::vector' + - key: performance-move-const-arg.CheckTriviallyCopyableMove + value: '1' + - key: performance-move-constructor-init.IncludeStyle + value: llvm + - key: performance-type-promotion-in-math-fn.IncludeStyle + value: llvm + - key: readability-braces-around-statements.ShortStatementLines + value: '0' + # BranchThreshold, NestingThreshold, and ParameterThreshold are set to UINT32_MAX + # to effectively disable these sub-checks. 
GNN/ML kernels legitimately have deep + # nesting (loops over nodes, edges, and features) and many parameters (model configs, + # hyperparameters), so enforcing these limits would generate noise on valid code. + - key: readability-function-size.BranchThreshold + value: '4294967295' + - key: readability-function-size.LineThreshold + value: '1000' + - key: readability-function-size.NestingThreshold + value: '4294967295' + - key: readability-function-size.ParameterThreshold + value: '4294967295' + - key: readability-function-size.StatementThreshold + value: '800' + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.ClassConstantPrefix + value: k + - key: readability-identifier-naming.ClassConstantCase + value: CamelCase + - key: readability-identifier-naming.ClassMemberCase + value: camelBack + - key: readability-identifier-naming.ConstexprVariableCase + value: CamelCase + - key: readability-identifier-naming.ConstexprVariablePrefix + value: k + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.EnumConstantCase + value: CamelCase + - key: readability-identifier-naming.FunctionCase + value: camelBack + - key: readability-identifier-naming.GlobalConstantPrefix + value: k + - key: readability-identifier-naming.GlobalConstantCase + value: CamelCase + - key: readability-identifier-naming.IgnoreFailedSplit + value: '0' + - key: readability-identifier-naming.LocalConstantCase + value: camelBack + - key: readability-identifier-naming.MemberCase + value: camelBack + - key: readability-identifier-naming.MethodCase + value: camelBack + - key: readability-identifier-naming.ParameterCase + value: camelBack + - key: readability-identifier-naming.PrivateMemberCase + value: camelBack + - key: readability-identifier-naming.PrivateMemberPrefix + value: _ + - key: readability-identifier-naming.ProtectedMemberCase + value: camelBack + - key: 
readability-identifier-naming.ProtectedMemberPrefix + value: _ + - key: readability-identifier-naming.PublicMemberCase + value: camelBack + - key: readability-identifier-naming.TemplateParameterCase + value: camelBack + - key: readability-identifier-naming.TypeTemplateParameterCase + value: CamelCase + - key: readability-identifier-naming.UnionCase + value: CamelCase + - key: readability-identifier-naming.VariableCase + value: camelBack + - key: readability-implicit-bool-conversion.AllowPointerConditions + value: '1' + - key: readability-simplify-boolean-expr.ChainedConditionalAssignment + value: '0' + - key: readability-simplify-boolean-expr.ChainedConditionalReturn + value: '0' + - key: readability-static-accessed-through-instance.NameSpecifierNestingThreshold + value: '3' +... diff --git a/.clangd b/.clangd new file mode 100644 index 000000000..d1812104f --- /dev/null +++ b/.clangd @@ -0,0 +1,2 @@ +CompileFlags: + CompilationDatabase: gigl-core/.cache/cmake_build diff --git a/.github/cloud_builder/run_command_on_active_checkout.yaml b/.github/cloud_builder/run_command_on_active_checkout.yaml index d0e407f02..d99c024a3 100644 --- a/.github/cloud_builder/run_command_on_active_checkout.yaml +++ b/.github/cloud_builder/run_command_on_active_checkout.yaml @@ -3,7 +3,7 @@ substitutions: options: logging: CLOUD_LOGGING_ONLY steps: - - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:b34c863a2168c8df5a6da1f6385e5d374f0175d2.91.1 + - name: us-central1-docker.pkg.dev/external-snap-ci-github-gigl/gigl-base-images/gigl-builder:b598f3d72eee47f5513dcb39460944459a0a012f.108.1 entrypoint: /bin/bash args: - -c diff --git a/.github/workflows/on-pr-comment.yml b/.github/workflows/on-pr-comment.yml index 870c415fe..90ea5ef32 100644 --- a/.github/workflows/on-pr-comment.yml +++ b/.github/workflows/on-pr-comment.yml @@ -64,6 +64,25 @@ jobs: command: | make unit_test_py + unit-test-cpp: + if: ${{ github.event.issue.pull_request && 
(contains(github.event.comment.body, '/unit_test_cpp') || endsWith(github.event.comment.body, '/unit_test') || contains(github.event.comment.body, '/all_test')) }} + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - name: Run C++ Unit Tests + uses: snapchat/gigl/.github/actions/run-command-on-pr@main + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + pr_number: ${{ github.event.issue.number }} + should_leave_progress_comments: "true" + descriptive_workflow_name: "C++ Unit Test" + use_cloud_run: "false" + command: | + bash gigl-core/requirements/install_cpp_deps.sh + bash requirements/install_py_deps.sh --skip-glt-post-install + export PATH="$HOME/.local/bin:$PATH" + make unit_test_cpp + unit-test-scala: if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, '/unit_test_scala') || endsWith(github.event.comment.body, '/unit_test') || contains(github.event.comment.body, '/all_test')) }} runs-on: ubuntu-latest diff --git a/.github/workflows/on-pr-merge.yml b/.github/workflows/on-pr-merge.yml index 0e1f9ddd0..38663f0c3 100644 --- a/.github/workflows/on-pr-merge.yml +++ b/.github/workflows/on-pr-merge.yml @@ -70,6 +70,21 @@ jobs: service_account: ${{ secrets.gcp_service_account_email }} project: ${{ vars.GCP_PROJECT_ID }} + ci-unit-test-cpp: + if: github.event_name == 'merge_group' + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - uses: actions/checkout@v4 + - name: Install C++ dependencies + run: bash gigl-core/requirements/install_cpp_deps.sh + - name: Set up Python tools + uses: ./.github/actions/setup-python-tools + - name: Install Python dependencies + run: bash requirements/install_py_deps.sh --skip-glt-post-install + - name: Run C++ Unit Tests + run: make unit_test_cpp + ci-integration-test: if: github.event_name == 'merge_group' runs-on: ubuntu-latest diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 777e68183..3325d1c98 100644 --- a/.github/workflows/release.yml +++ 
b/.github/workflows/release.yml @@ -12,8 +12,17 @@ permissions: jobs: build: - name: Build and release pip whl - runs-on: ubuntu-latest + name: Build and release pip whl (${{ matrix.torch-variant }}) + strategy: + matrix: + include: + - runner: ubuntu-latest + torch-variant: cpu + publish-url: https://us-central1-python.pkg.dev/external-snap-ci-github-gigl/gigl + - runner: gigl-gpu-instances + torch-variant: cu128 + publish-url: https://us-central1-python.pkg.dev/external-snap-ci-github-gigl/gigl-cu128 + runs-on: ${{ matrix.runner }} env: PROJECT_ID: ${{ vars.GCP_PROJECT_ID }} environment: @@ -31,6 +40,7 @@ jobs: gcp_project_id: ${{ vars.GCP_PROJECT_ID }} workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }} gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} + # We need build and twine to build the whl and upload it to Google Artifact Registry. # keyrings.google-artifactregistry-auth is needed to authenticate with Google Artifact Registry. # See: https://cloud.google.com/artifact-registry/docs/python/store-python @@ -40,20 +50,28 @@ jobs: # Pre-install keyring and Artifact Registry plugin from the public PyPI uv tool install keyring --with keyrings.google-artifactregistry-auth==1.1.2 - - name: Build Whl Distribution - run: uv build + # gigl-core contains compiled C++/CUDA extensions that are ABI-bound to the + # torch variant. Build and publish one wheel per variant. + # Build isolation is disabled via no-build-isolation-package in pyproject.toml + # so cmake can find torch from the ambient environment (not on PyPI). + # scikit-build-core is declared in the build-backend dependency group and installed by uv. + - name: Build gigl-core wheel + run: | + # Remove stale cmake cache from previous runs on self-hosted runners. 
+ rm -rf gigl-core/.cache/cmake_build + uv build --wheel gigl-core/ --locked - - name: Publish Package 🚀 - env: - PYPIRC_CONTENTS: ${{ secrets.PYPIRC_CONTENTS }} - PIP_CONF_CONTENTS: ${{ secrets.PIP_CONF_CONTENTS }} - # We upload the build whls to Google Artifact Registry. + - name: Publish gigl-core wheel + working-directory: gigl-core run: | - uv publish --index gcp-release-registry --username oauth2accesstoken --keyring-provider subprocess + uv publish --publish-url ${{ matrix.publish-url }} --username oauth2accesstoken --keyring-provider subprocess - - name: Post Publish Package - if: always() - # Clean up files created during Publish Package step. + # gigl is pure Python — build is fast and produces the same wheel on both + # runners. Publish to each variant's registry so the registry is + # self-contained: a user only needs one GCP extra-index URL per variant. + # gigl-core is published first so its dependency is already in the registry + # when gigl becomes available. + - name: Build and publish gigl wheel run: | - rm -rf ~/.pypirc - rm -rf ~/.pip/pip.conf + uv build --wheel --locked + uv publish --publish-url ${{ matrix.publish-url }} --username oauth2accesstoken --keyring-provider subprocess diff --git a/.gitignore b/.gitignore index 26bb8b2e8..8bbaf1d0d 100644 --- a/.gitignore +++ b/.gitignore @@ -49,5 +49,8 @@ fossa*.zip # https://github.com/google-github-actions/auth/issues/497 gha-creds-*.json +# Compiled C++ extension modules +gigl-core/**/*.so + # Local-only scripts with hardcoded internal identifiers scripts/_local/ diff --git a/Makefile b/Makefile index dc3accd94..9db02bb80 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,10 @@ DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG?=${DOCKER_IMAGE_MAIN_CPU_NAME}:${DATE} DOCKER_IMAGE_DEV_WORKBENCH_NAME_WITH_TAG?=${DOCKER_IMAGE_DEV_WORKBENCH_NAME}:${DATE} PYTHON_DIRS:=.github/scripts examples gigl tests snapchat scripts +CPP_SOURCES:=$(shell find gigl-core/csrc \( -name "*.cpp" -o -name "*.cu" \) 2>/dev/null) +# 
clang-tidy 15 does not fully support CUDA syntax (e.g. <<<...>>>, __global__). +# Exclude .cu files from tidy targets; clang-format and clangd handle them fine. +CPP_SOURCES_NO_CUDA:=$(filter-out %.cu,$(CPP_SOURCES)) PY_TEST_FILES?="*_test.py" # You can override GIGL_TEST_DEFAULT_RESOURCE_CONFIG by setting it in your environment i.e. # adding `export GIGL_TEST_DEFAULT_RESOURCE_CONFIG=your_resource_config` to your shell config (~/.bashrc, ~/.zshrc, etc.) @@ -47,6 +51,7 @@ check_if_valid_env: # if developing, you need to install dev deps instead install_dev_deps: check_if_valid_env gcloud auth configure-docker us-central1-docker.pkg.dev + bash ./gigl-core/requirements/install_cpp_deps.sh bash ./requirements/install_py_deps.sh --dev bash ./requirements/install_scala_deps.sh uv pip install -e . @@ -75,7 +80,7 @@ assert_yaml_configs_parse: # Ex. `make unit_test_py PY_TEST_FILES="eval_metrics_test.py"` # By default, runs all tests under tests/unit. # See the help text for "--test_file_pattern" in tests/test_args.py for more details. -unit_test_py: clean_build_files_py type_check +unit_test_py: clean_build_files_py build_cpp_extensions type_check uv run python -m tests.unit.main \ --env=test \ --resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \ @@ -94,7 +99,10 @@ unit_test_scala: clean_build_files_scala # Eventually, we should look into splitting these up. # We run `make check_format` separately instead of as a dependent make rule so that it always runs after the actual testing. # We don't want to fail the tests due to non-conformant formatting during development. -unit_test: precondition_tests unit_test_py unit_test_scala +unit_test_cpp: + $(MAKE) -C gigl-core unit_test_cpp + +unit_test: precondition_tests unit_test_py unit_test_scala unit_test_cpp check_format_py: uv run ruff check --config pyproject.toml ${PYTHON_DIRS} @@ -108,13 +116,18 @@ check_format_md: @echo "Checking markdown files..." 
 uv run mdformat --check ${MD_FILES} -check_format: check_format_py check_format_scala check_format_md +check_format_cpp: + $(MAKE) -C gigl-core check_format_cpp + +# Checks formatting only (clang-format, ruff, scalafmt, mdformat). Does NOT run +# clang-tidy static analysis — use `make check_lint_cpp` for that. +check_format: check_format_py check_format_cpp check_format_scala check_format_md # Set PY_TEST_FILES= to test a specifc file. # Ex. `make integration_test PY_TEST_FILES="dataflow_test.py"` # By default, runs all tests under tests/integration. # See the help text for "--test_file_pattern" in tests/test_args.py for more details. -integration_test: +integration_test: build_cpp_extensions uv run python -m tests.integration.main \ --env=test \ --resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \ @@ -141,14 +154,40 @@ format_md: @echo "Formatting markdown files..." uv run mdformat ${MD_FILES} -format: format_py format_scala format_md +format_cpp: + $(MAKE) -C gigl-core format_cpp + +format: format_py format_cpp format_scala format_md type_check: uv run mypy ${PYTHON_DIRS} --check-untyped-defs -lint_test: check_format assert_yaml_configs_parse +build_cpp_extensions: + $(MAKE) -C gigl-core build_cpp_extensions + +check_lint_cpp: build_cpp_extensions + $(if $(CPP_SOURCES_NO_CUDA),uv run python -m scripts.run_cpp_lint $(CPP_SOURCES_NO_CUDA)) + +# Not part of `make format`: clang-tidy --fix rewrites logic (renames identifiers, +# changes expressions, adds/removes keywords), not just style. Run manually and +# review the diff before committing. Note: --fix cannot auto-repair every check; +# some violations require manual edits. +# --extra-arg=-Wno-ignored-optimization-argument suppresses GCC-specific LTO flags +# (-fno-fat-lto-objects, -flto=auto) that cmake writes into compile_commands.json. 
+# clang-tidy forwards compiler warnings via clang-diagnostic-*, and .clang-tidy sets +# WarningsAsErrors: '*', so the warning must be silenced at the compiler level before +# clang-tidy ever sees it. +fix_lint_cpp: build_cpp_extensions + $(if $(CPP_SOURCES_NO_CUDA),clang-tidy-15 --fix --extra-arg=-Wno-ignored-optimization-argument -p gigl-core/.cache/cmake_build/compile_commands.json $(CPP_SOURCES_NO_CUDA)) + +lint_test: check_format assert_yaml_configs_parse check_lint_cpp @echo "Lint checks pass!" +# Wipe cmake build caches. Use this if cmake's cached state becomes inconsistent +# after switching between branches with substantially different CMakeLists.txt structure. +clean_cpp: + $(MAKE) -C gigl-core clean_cpp + # compiles current working state of scala projects to local jars compile_jars: @echo "Compiling jars..." @@ -260,7 +299,7 @@ run_all_e2e_tests: # Example: # `make compiled_pipeline_path="/tmp/gigl/my_pipeline.yaml" compile_gigl_kubeflow_pipeline` # Can be a GCS URI as well -compile_gigl_kubeflow_pipeline: compile_jars push_new_docker_images +compile_gigl_kubeflow_pipeline: build_cpp_extensions compile_jars push_new_docker_images uv run python -m gigl.orchestration.kubeflow.runner \ --action=compile \ --container_image_cuda=${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG} \ @@ -286,7 +325,7 @@ _skip_build_deps: # job_name=... 
\ , and other params # compiled_pipeline_path="/tmp/gigl/my_pipeline.yaml" \ # run_dev_gnn_kubeflow_pipeline -run_dev_gnn_kubeflow_pipeline: $(if $(compiled_pipeline_path), _skip_build_deps, compile_jars push_new_docker_images) +run_dev_gnn_kubeflow_pipeline: $(if $(compiled_pipeline_path), _skip_build_deps, build_cpp_extensions compile_jars push_new_docker_images) uv run python -m gigl.orchestration.kubeflow.runner \ $(if $(compiled_pipeline_path),,--container_image_cuda=${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG}) \ $(if $(compiled_pipeline_path),,--container_image_cpu=${DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG}) \ @@ -311,7 +350,10 @@ clean_build_files_scala: ( cd scala; sbt clean; find . -type d -name "target" -prune -exec rm -rf {} \; ) ( cd scala_spark35; sbt clean; find . -type d -name "target" -prune -exec rm -rf {} \; ) -clean_build_files: clean_build_files_py clean_build_files_scala +clean_build_files_cpp: + $(MAKE) -C gigl-core clean_build_files_cpp + +clean_build_files: clean_build_files_py clean_build_files_scala clean_build_files_cpp # Call to generate new proto definitions if any of the .proto files have been changed. # We intentionally rebuild *all* protos with one commmand as they should all be in sync. diff --git a/RELEASING.md b/RELEASING.md new file mode 100644 index 000000000..95a46d5f6 --- /dev/null +++ b/RELEASING.md @@ -0,0 +1,64 @@ +# Releasing GiGL + +## Two-wheel model + +GiGL is distributed as two wheels that are always installed together: + +- **`gigl`** — pure Python (same wheel for CPU and CUDA users) +- **`gigl-core`** — compiled C++/CUDA extensions, ABI-bound to the torch variant + +Both wheels are always versioned and released together. `bump_version.py` updates both versions and keeps the +`gigl-core` pin in `pyproject.toml` in sync automatically. + +## Release process + +A full release involves two GitHub Actions workflows run in sequence. 
+ +### Step 1 — Create Release (`create_release.yml`) + +This workflow bumps the version, creates the release branch and tag, releases the KFP pipeline, and opens the merge-back +PR. Trigger it manually: + +1. Go to the **Actions** tab in the GitHub repository. +2. Select **Create Release** from the left sidebar. +3. Click **Run workflow**, choose the bump type (`major`, `minor`, or `patch`), and click **Run workflow**. + +The workflow will: + +- Bump the version in `pyproject.toml` (both `gigl` and `gigl-core`) and commit it to a new `release/vX.Y.Z` branch. +- Release the GiGL KFP pipeline from that branch. +- Create and push a version tag `vX.Y.Z`. +- Open a PR to merge `release/vX.Y.Z` back to `main`. + +### Step 2 — Release GiGL (`release.yml`) + +This workflow builds and publishes the `gigl` and `gigl-core` wheels. Trigger it from the release branch created in Step +1: + +1. Go to the **Actions** tab. +2. Select **Release GiGL** from the left sidebar. +3. Click **Run workflow**, select the `release/vX.Y.Z` branch from the branch dropdown, and click **Run workflow**. + +The workflow runs two jobs in parallel — one on a CPU runner, one on a GPU runner — and publishes both wheels to both +variant registries. + +### Step 3 — Merge the PR + +Once both workflows succeed, merge the PR opened by **Create Release** to bring the version bump back into `main`. + +## Nightly releases + +Nightly builds are triggered automatically by `nightly_release_&_test.yml`, which calls `create_release.yml` with +`bump_type=nightly`. The **Release GiGL** wheel-publish step is not part of the nightly flow. 
+ +## What gets published + +Each release run publishes to two self-contained registries: + +| Registry | Packages | +| ---------------------------- | ---------------------------------------------------- | +| `gcp-release-registry-cpu` | `gigl` (pure Python) + `gigl-core` (CPU wheel) | +| `gcp-release-registry-cu128` | `gigl` (pure Python) + `gigl-core` (CUDA 12.8 wheel) | + +Users install from exactly one registry based on their variant — see +[installation docs](docs/user_guide/getting_started/installation.md). diff --git a/containers/Dockerfile.builder b/containers/Dockerfile.builder index b228fc263..6388a5c6b 100644 --- a/containers/Dockerfile.builder +++ b/containers/Dockerfile.builder @@ -62,6 +62,12 @@ COPY gigl/scripts gigl/scripts COPY .python-version tmp/.python-version +# gigl-core is a path dependency in pyproject.toml. uv sync needs its metadata to +# resolve the lockfile. Copying only the build manifest (no C++ sources) so cmake +# configures but compiles nothing — the src Dockerfile installs the real wheel later. +COPY gigl-core/pyproject.toml gigl-core/pyproject.toml +COPY gigl-core/CMakeLists.txt gigl-core/CMakeLists.txt +COPY gigl-core/README.md gigl-core/README.md RUN bash ./requirements/install_py_deps.sh --dev # The UV_PROJECT_ENVIRONMENT environment variable can be used to configure the project virtual environment path diff --git a/containers/Dockerfile.cpu.base b/containers/Dockerfile.cpu.base index 4ef552789..cba87627f 100644 --- a/containers/Dockerfile.cpu.base +++ b/containers/Dockerfile.cpu.base @@ -26,6 +26,12 @@ COPY uv.lock uv.lock COPY requirements requirements COPY gigl/scripts gigl/scripts COPY .python-version .python-version +# gigl-core is a path dependency in pyproject.toml. uv sync needs its metadata to +# resolve the lockfile. Copying only the build manifest (no C++ sources) so cmake +# configures but compiles nothing — the src Dockerfile installs the real wheel later. 
+COPY gigl-core/pyproject.toml gigl-core/pyproject.toml +COPY gigl-core/CMakeLists.txt gigl-core/CMakeLists.txt +COPY gigl-core/README.md gigl-core/README.md RUN bash ./requirements/install_py_deps.sh diff --git a/containers/Dockerfile.cuda.base b/containers/Dockerfile.cuda.base index fdca2c01b..f6d05397f 100644 --- a/containers/Dockerfile.cuda.base +++ b/containers/Dockerfile.cuda.base @@ -31,6 +31,12 @@ COPY pyproject.toml pyproject.toml COPY uv.lock uv.lock COPY requirements requirements COPY gigl/scripts gigl/scripts +# gigl-core is a path dependency in pyproject.toml. uv sync needs its metadata to +# resolve the lockfile. Copying only the build manifest (no C++ sources) so cmake +# configures but compiles nothing — the src Dockerfile installs the real wheel later. +COPY gigl-core/pyproject.toml gigl-core/pyproject.toml +COPY gigl-core/CMakeLists.txt gigl-core/CMakeLists.txt +COPY gigl-core/README.md gigl-core/README.md RUN bash ./requirements/install_py_deps.sh diff --git a/containers/Dockerfile.dataflow.base b/containers/Dockerfile.dataflow.base index 76a57d14d..7eb98df8d 100644 --- a/containers/Dockerfile.dataflow.base +++ b/containers/Dockerfile.dataflow.base @@ -25,6 +25,12 @@ COPY pyproject.toml pyproject.toml COPY uv.lock uv.lock COPY requirements requirements COPY gigl/scripts gigl/scripts +# gigl-core is a path dependency in pyproject.toml. uv sync needs its metadata to +# resolve the lockfile. Copying only the build manifest (no C++ sources) so cmake +# configures but compiles nothing — the src Dockerfile installs the real wheel later. 
+COPY gigl-core/pyproject.toml gigl-core/pyproject.toml +COPY gigl-core/CMakeLists.txt gigl-core/CMakeLists.txt +COPY gigl-core/README.md gigl-core/README.md RUN bash ./requirements/install_py_deps.sh --skip-glt-post-install diff --git a/containers/Dockerfile.dataflow.src b/containers/Dockerfile.dataflow.src index b5d29c7f0..d9b9c6ac7 100644 --- a/containers/Dockerfile.dataflow.src +++ b/containers/Dockerfile.dataflow.src @@ -13,6 +13,12 @@ COPY deployment deployment COPY gigl gigl COPY snapchat snapchat COPY tests tests + +# Build and install gigl-core C++ extensions first. gigl-core/ must be present +# before `uv pip install` because gigl declares it as a path dependency. +COPY gigl-core gigl-core +RUN uv pip install gigl-core/ + RUN uv pip install -e . WORKDIR / diff --git a/containers/Dockerfile.src b/containers/Dockerfile.src index b80295962..4d77c6325 100644 --- a/containers/Dockerfile.src +++ b/containers/Dockerfile.src @@ -1,15 +1,11 @@ ARG BASE_IMAGE FROM $BASE_IMAGE -# Copy the source -WORKDIR /gigl - - # Note: main package files must live in root of the repo for the python package to be built correctly for Dataflow workers. # See https://beam.apache.org/documentation/sdks/python-pipxeline-dependencies/#create-reproducible-environments. WORKDIR /gigl -COPY MANIFEST.in MANIFEST.in +COPY README.md README.md COPY pyproject.toml pyproject.toml COPY uv.lock uv.lock COPY gigl/dep_vars.env gigl/dep_vars.env @@ -19,4 +15,10 @@ COPY snapchat snapchat COPY tests tests COPY examples examples -RUN uv pip install -e . +# Build and install gigl-core C++ extensions first. gigl-core/ must be present +# before `uv pip install .` because gigl declares it as a path dependency. +COPY gigl-core gigl-core +RUN uv pip install gigl-core/ + +# Install gigl (pure Python — fast, no CMake). gigl-core is already satisfied above. +RUN uv pip install . 
diff --git a/docs/cpp_style_guide.md b/docs/cpp_style_guide.md new file mode 100644 index 000000000..0a9e684ab --- /dev/null +++ b/docs/cpp_style_guide.md @@ -0,0 +1,194 @@ +# C++ Style Guide + +GiGL enforces C++ style automatically via two tools: + +- **clang-format** (`.clang-format`) — code formatting +- **clang-tidy** (`.clang-tidy`) — static analysis and lint + +All clang-tidy warnings are treated as errors. + +## Running the Tools + +```bash +make format_cpp # Format all C++ files in-place (clang-format) +make check_format_cpp # Check formatting without modifying (clang-format only, not lint) +make check_lint_cpp # Run clang-tidy static analysis +``` + +> **Note — CUDA files (`.cu`) are excluded from clang-tidy.** clang-tidy 15 does not support CUDA syntax and will error +> on `.cu` files. The Makefile defines `CPP_SOURCES_NO_CUDA` (which filters out `.cu` files) and passes only that set to +> clang-tidy. If you add a new `.cu` file, it will not appear in lint output — this is expected. Lint coverage for CUDA +> files requires upgrading to a clang-tidy version with CUDA support. + +______________________________________________________________________ + +## Build Configuration + +All builds use `-O3 -g`: full optimization with debug symbols always enabled. Debug symbols add no runtime overhead and +ensure stack traces are always readable. + +______________________________________________________________________ + +## Formatting (`.clang-format`) + +The style is based on LLVM with the following notable deviations: + +### Line length + +``` +ColumnLimit: 120 +``` + +120 columns rather than the LLVM default of 80. ML and graph code tends to have longer identifiers and nested template +types; 120 gives enough room without forcing awkward wraps. 
+ +### Indentation and braces + +``` +IndentWidth: 4 +BreakBeforeBraces: Attach # K&R / "same-line" style +UseTab: Never +IndentCaseLabels: true # case labels indented inside switch +NamespaceIndentation: None # namespace bodies not indented +``` + +### Pointer and reference alignment + +``` +PointerAlignment: Left +``` + +Pointers bind to the type, not the name: `int* x`, not `int *x`. + +### Parameter and argument wrapping + +``` +BinPackArguments: false +BinPackParameters: false +``` + +When a function call or declaration doesn't fit on one line, every argument/parameter gets its own line. Mixed +"bin-packing" (some on one line, some wrapped) is not allowed. + +### Templates + +``` +AlwaysBreakTemplateDeclarations: true +``` + +`template <...>` always appears on its own line, keeping the declaration signature visually separate from the template +header. + +### Include ordering + +Includes are sorted and split into three priority groups (lower number = appears first in the file): + +| Priority | Pattern | Group | +| -------- | ---------------------- | ------------------------------------ | +| 1 | `^<(torch\|pybind11)/` | Torch and pybind11 headers (first) | +| 2 | `^(<\|"gtest/)` | System and other third-party headers | +| 3 | `.*` | Local project headers (last) | + +> When GLT (`graphlearn_torch`) headers are added, include `graphlearn_torch` in the Priority 1 pattern. + +### Raw string formatting + +Raw string literals with the `pb` delimiter (e.g. `R"pb(...)pb"`) are formatted as TextProto using Google style, +matching the protobuf idiom used throughout the codebase. + +______________________________________________________________________ + +## Static Analysis (`.clang-tidy`) + +### Check philosophy + +A broad set of check families is enabled to catch bugs, enforce modern C++ idioms, and maintain readability. All +warnings are errors — there is no "warning-only" category. 
+ +Enabled families: + +| Family | What it covers | +| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | +| `boost-use-to-string` | Prefer `std::to_string` over `boost::lexical_cast` for numeric conversions | +| `bugprone-*` | Common programming mistakes: dangling handles, suspicious string construction, assert side effects, etc. | +| `cert-*` | CERT secure coding rules for error handling (`err34-c`), floating-point loops (`flp30-c`), and RNG seeding (`msc32-c`, `msc50/51-cpp`) | +| `clang-diagnostic-*` | Compiler diagnostic warnings surfaced as lint checks (e.g. `-Wall`, `-Wextra` violations) | +| `cppcoreguidelines-*` | C++ Core Guidelines: no raw `malloc`, no union member access, no object slicing, safe downcasts | +| `google-*` | Google C++ style: explicit constructors, no global names in headers, safe `memset` usage | +| `hicpp-exception-baseclass` | All thrown exceptions must derive from `std::exception` | +| `misc-*` | Miscellaneous: header-only definitions, suspicious enum usage, throw-by-value/catch-by-reference, etc. | +| `modernize-*` | Modernize to C++11/14/17: `nullptr`, range-based for, `make_unique`, `using` aliases, etc. 
| +| `performance-*` | Unnecessary copies, inefficient string ops, missed `emplace`, type promotions in math functions | +| `readability-*` | Naming conventions, braces around statements, boolean simplification, function size limits | + +### Disabled checks + +Some checks in the above families are disabled where they produce excessive noise or conflict with common patterns in +this codebase: + +| Disabled check | Reason | +| ----------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `bugprone-easily-swappable-parameters` | Tensor and sampler APIs legitimately have many adjacent same-typed parameters | +| `bugprone-implicit-widening-of-multiplication-result` | Crashes clang-tidy 15 on a construct in `ATen/core/dynamic_type.h` (upstream LLVM bug). Re-enable when upgrading past clang-tidy 15. | +| `bugprone-narrowing-conversions` | Too noisy in ML code mixing `int`/`int64_t`/`size_t` for tensor dimensions | +| `misc-confusable-identifiers` | Performs an O(n²) comparison of all identifiers in scope to detect Unicode homoglyphs. PyTorch headers introduce thousands of identifiers, making this check account for ~70% of total lint time. All identifiers in this codebase are standard ASCII. | +| `misc-const-correctness` | Produces false positives with pybind11 types whose mutation happens through `operator[]` (which is non-const). The check incorrectly suggests `const` on variables that are mutated. | +| `misc-no-recursion` | Recursive graph algorithms are intentional | +| `modernize-avoid-c-arrays` | C arrays are needed for pybind11 and C-interop code | +| `modernize-use-trailing-return-type` | Trailing return types (`auto f() -> T`) are only useful when the return type depends on template params. 
Requiring them everywhere is non-standard and reduces readability. | +| `readability-avoid-const-params-in-decls` | Incorrectly fires on `const T&` parameters in multi-line declarations (clang-tidy 15 bug). The check is meant for top-level const on by-value params, which is a separate, valid concern. | +| `readability-container-contains` | `.contains()` requires C++20; the codebase builds with C++17 | +| `readability-identifier-length` | Short loop variables (`i`, `j`, `k`) are idiomatic | +| `readability-function-cognitive-complexity` | Algorithmic code often requires nesting that is inherent to the problem structure. Enforcing an arbitrary complexity ceiling discourages clarity and encourages artificial decomposition. | +| `readability-magic-numbers` | Literal constants are common in ML code (e.g. feature dimensions) | + +### Naming conventions + +Enforced via `readability-identifier-naming`: + +| Identifier kind | Convention | Example | +| --------------------------------------------------------- | ---------------------------- | ----------------- | +| Classes, enums, unions | `PascalCase` | `DistDataset` | +| Type template parameters | `PascalCase` | `NodeType` | +| Functions, methods | `camelCase` | `sampleNeighbors` | +| Variables, parameters, members | `camelCase` | `numNodes` | +| Private/protected members | `camelCase` with `_` prefix | `_nodeFeatures` | +| Constants (`constexpr`, `const` globals, class constants) | `PascalCase` with `k` prefix | `kMaxBatchSize` | + +> **Note — clang-tidy option names:** `PascalCase` maps to clang-tidy's `CamelCase` enum value; `camelCase` maps to +> `camelBack`. 
+ +### Key option tuning + +| Option | Value | Effect | +| ---------------------------------------------------------- | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `WarningsAsErrors` | `*` | Every check failure is a hard error in CI | +| `HeaderFilterRegex` | `.*/gigl-core/csrc/.*` | Scopes checks to our own headers. Using `.*` causes clang-tidy to report warnings from every PyTorch/pybind11 header it parses, flooding output with thousands of third-party issues. | +| `FormatStyle` | `none` | clang-tidy does not auto-reformat; use clang-format separately | +| `bugprone-string-constructor.LargeLengthThreshold` | `8388608` (8 MB) | Strings larger than 8 MB from a length argument are flagged | +| `modernize-loop-convert.NamingStyle` | `camelBack` | Auto-generated loop variable names use camelBack, matching `readability-identifier-naming.VariableCase` | +| `readability-function-size.LineThreshold` | `1000` | Functions over 1000 lines are flagged | +| `readability-braces-around-statements.ShortStatementLines` | `0` | Braces required for all control-flow bodies, even single-line | + +______________________________________________________________________ + +## pybind11 Extension Modules + +Extension modules live under `gigl-core/csrc/`. 
+ +### Naming convention + +| File | Purpose | +| -------------------------- | ---------------------------------------------------------------- | +| `python_<name>.cpp` | pybind11 bindings — contains the `PYBIND11_MODULE` definition | +| `<name>.cpp` / `<name>.cu` | Implementation — function and class definitions | +| `<name>.h` | Declarations (function signatures, class definitions, constants) | + +Example: to add a `my_op` extension under `gigl-core/csrc/sampling/`: + +``` +gigl-core/csrc/sampling/python_my_op.cpp ← pybind11 bindings +gigl-core/csrc/sampling/my_op.cpp ← implementation +``` + +The compiled `.so` is installed into the `gigl_core` package and importable as `gigl_core.<name>`. diff --git a/docs/user_guide/getting_started/installation.md b/docs/user_guide/getting_started/installation.md index 15d78eec4..b823837a8 100644 --- a/docs/user_guide/getting_started/installation.md +++ b/docs/user_guide/getting_started/installation.md @@ -6,12 +6,24 @@ These are the current environments supported by GiGL | Python | Mac (Arm64) CPU | Linux CPU | Linux CUDA | PyTorch | PyG | | ------ | --------------- | --------- | ---------- | ------- | --- | -| 3.9 | Partial Support | Supported | 12.1 | 2.5 | 2.5 | +| 3.11 | Supported | Supported | 12.8 | 2.8 | 2.7 | ## Available Versions -You can see the available wheels for GiGL -[here](https://console.cloud.google.com/artifacts/python/external-snap-ci-github-gigl/us-central1/gigl/gigl?project=external-snap-ci-github-gigl) +GiGL is distributed as two wheels that are installed together: + +- **`gigl`** — pure Python package (same wheel for CPU and CUDA users) +- **`gigl-core`** — compiled C++/CUDA extensions, ABI-bound to the torch variant + +You do not need to install `gigl-core` directly; it is a dependency of `gigl` and is resolved automatically from the +same registry. 
+ +Each registry is self-contained — you only need one GCP extra-index URL: + +| Variant | Registry | +| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| CPU | [gigl (CPU registry)](https://console.cloud.google.com/artifacts/python/external-snap-ci-github-gigl/us-central1/gigl/gigl?project=external-snap-ci-github-gigl) | +| CUDA 12.8 | [gigl-cu128 (CUDA registry)](https://console.cloud.google.com/artifacts/python/external-snap-ci-github-gigl/us-central1/gigl/gigl-cu128?project=external-snap-ci-github-gigl) | ## Install Prerequisites - setting up your dev machine @@ -103,33 +115,28 @@ Below we provide two ways to bootstrap an environment for using and/or developin 2. Install GiGL -#### Install GiGL + necessary tooling for PyG 2.7 + Torch 2.8 on Cuda12.8 +#### Install GiGL + necessary tooling for PyG 2.7 + Torch 2.8 on CUDA 12.8 ```bash -pip install "gigl[pyg27-torch28-cu128, transform]==0.1.0" \ ---extra-index-url=https://us-central1-python.pkg.dev/external-snap-ci-github-gigl/gigl/simple/ \ +pip install "gigl[pyg27-torch28-cu128, transform]==0.2.0" \ +--extra-index-url=https://us-central1-python.pkg.dev/external-snap-ci-github-gigl/gigl-cu128/simple/ \ --extra-index-url=https://download.pytorch.org/whl/cu128 \ --extra-index-url=https://data.pyg.org/whl/torch-2.8.0+cu128.html ``` -Currently, the dependency used for in-memory subgraph sampling is easiest to install from source, so we run the -post-install script each time: - -```bash -gigl-post-install -``` - #### Install GiGL + necessary tooling for PyG 2.7 + Torch 2.8 on CPU ```bash -pip install "gigl[pyg27-torch28-cpu, transform]==0.1.0" \ +pip install "gigl[pyg27-torch28-cpu, transform]==0.2.0" \ --extra-index-url=https://us-central1-python.pkg.dev/external-snap-ci-github-gigl/gigl/simple/ \ --extra-index-url=https://download.pytorch.org/whl/cpu \ 
--extra-index-url=https://data.pyg.org/whl/torch-2.8.0+cpu.html ``` -Currently, the dependency used for in-memory subgraph sampling is easiest to install from source, so we run the -post-install script each time: +pip resolves and installs `gigl-core` automatically from the same GCP registry. No separate install step is needed. + +Currently, building/using wheels for GLT is error prone, thus we opt to install from source every time. Run post-install +script to setup GLT dependency: ```bash gigl-post-install diff --git a/gigl-core/CMakeLists.txt b/gigl-core/CMakeLists.txt new file mode 100644 index 000000000..5b0e6d9f2 --- /dev/null +++ b/gigl-core/CMakeLists.txt @@ -0,0 +1,121 @@ +cmake_minimum_required(VERSION 3.18) +project(gigl_core CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# CMP0104: CMake 3.18+ warns when enable_language(CUDA) is called without +# CMAKE_CUDA_ARCHITECTURES being set. Set it to OFF so CMake does not inject +# any -gencode flags (torch provides its own arch list via TORCH_LIBRARIES). +cmake_policy(SET CMP0104 NEW) +set(CMAKE_CUDA_ARCHITECTURES OFF) + +# Enable CUDA only when the toolkit is present; allows the same CMakeLists.txt +# to build on CPU-only machines without requiring nvcc. +include(CheckLanguage) +check_language(CUDA) +if(NOT CMAKE_CUDA_COMPILER) + find_program(CMAKE_CUDA_COMPILER nvcc HINTS /usr/local/cuda/bin) +endif() +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) +endif() + +# --------------------------------------------------------------------------- +# Extension modules — auto-discovered. +# Files named python_*.cpp under csrc/ are compiled as pybind11 extension +# modules. The companion .cpp (without the "python_" prefix) is included +# automatically when present. Add a new extension by dropping source files +# here; no changes to this CMakeLists.txt are needed. 
+# --------------------------------------------------------------------------- +if(CMAKE_CUDA_COMPILER) + file(GLOB_RECURSE _PYTHON_SRCS CONFIGURE_DEPENDS + "${CMAKE_CURRENT_SOURCE_DIR}/csrc/python_*.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/csrc/python_*.cu" + ) +else() + file(GLOB_RECURSE _PYTHON_SRCS CONFIGURE_DEPENDS + "${CMAKE_CURRENT_SOURCE_DIR}/csrc/python_*.cpp" + ) +endif() + +if(_PYTHON_SRCS) + find_package(Python COMPONENTS Interpreter Development.Module REQUIRED) + + # Locate pybind11 and torch cmake configs by scanning CMAKE_PREFIX_PATH, + # which scikit-build-core sets to the active environment's site-packages. + foreach(_prefix IN LISTS CMAKE_PREFIX_PATH) + if(NOT pybind11_DIR AND EXISTS "${_prefix}/pybind11/share/cmake/pybind11") + set(pybind11_DIR "${_prefix}/pybind11/share/cmake/pybind11") + endif() + if(NOT TORCH_CMAKE_PREFIX AND EXISTS "${_prefix}/torch/share/cmake") + set(TORCH_CMAKE_PREFIX "${_prefix}/torch/share/cmake") + endif() + endforeach() + + # pybind11 is provided by the scikit-build-core isolated build env when building a + # wheel. For direct cmake invocations (make unit_test_cpp), fall back to FetchContent + # so pybind11 does not need to be pre-installed in the dev venv. + if(pybind11_DIR) + find_package(pybind11 CONFIG REQUIRED) + else() + include(FetchContent) + FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG v2.13.6 + GIT_SHALLOW TRUE + ) + FetchContent_MakeAvailable(pybind11) + endif() + + if(NOT TORCH_CMAKE_PREFIX) + message(FATAL_ERROR "Cannot find torch cmake config in CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}. Run: make install_dev_deps") + endif() + + find_package(Torch REQUIRED PATHS "${TORCH_CMAKE_PREFIX}") + + # torch_python provides the pybind11 type casters for at::Tensor. It is not + # included in TORCH_LIBRARIES but is required for extensions that pass + # tensors across the Python/C++ boundary. 
+ find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib" REQUIRED) + + set(GIGL_COMPILE_FLAGS -O3 -g -Wall -Wextra -Wno-unused-parameter) + # nvcc does not accept bare -Wall/-Wextra; wrap with -Xcompiler for CUDA. + set(GIGL_COMPILE_FLAGS_CUDA -O3 -Xcompiler=-g,-Wall,-Wextra,-Wno-unused-parameter) + + foreach(_py_src IN LISTS _PYTHON_SRCS) + get_filename_component(_dir "${_py_src}" DIRECTORY) + get_filename_component(_stem "${_py_src}" NAME_WE) + string(REGEX REPLACE "^python_" "" _name "${_stem}") + + set(_sources "${_py_src}") + if(EXISTS "${_dir}/${_name}.cpp") + list(APPEND _sources "${_dir}/${_name}.cpp") + endif() + if(CMAKE_CUDA_COMPILER AND EXISTS "${_dir}/${_name}.cu") + list(APPEND _sources "${_dir}/${_name}.cu") + endif() + + pybind11_add_module("${_name}" ${_sources}) + target_link_libraries("${_name}" PRIVATE "${TORCH_LIBRARIES}" "${TORCH_PYTHON_LIBRARY}") + target_compile_options("${_name}" PRIVATE + $<$<COMPILE_LANGUAGE:CXX>:${GIGL_COMPILE_FLAGS}> + $<$<COMPILE_LANGUAGE:CUDA>:${GIGL_COMPILE_FLAGS_CUDA}> + ) + # TORCH_EXTENSION_NAME is used in PYBIND11_MODULE() to name the module. + # PyTorch's own build system sets this; we must define it explicitly here. + target_compile_definitions("${_name}" PRIVATE "TORCH_EXTENSION_NAME=${_name}") + # Install into gigl_core/ so the .so is importable as gigl_core.<name>. + install(TARGETS "${_name}" DESTINATION gigl_core) + endforeach() +endif() + +option(GIGL_CORE_BUILD_TESTS "Build C++ unit tests (used by make unit_test_cpp)" OFF) +if(GIGL_CORE_BUILD_TESTS) + enable_testing() + add_subdirectory(tests) +endif() diff --git a/gigl-core/Makefile b/gigl-core/Makefile new file mode 100644 index 000000000..f3f63c7b8 --- /dev/null +++ b/gigl-core/Makefile @@ -0,0 +1,46 @@ +# C++ build, test, format, and clean targets for gigl-core. +# Invoked from the GiGL repo root via: $(MAKE) -C gigl-core +# All paths below are relative to gigl-core/. 
+ +CPP_SOURCES := $(shell find csrc \( -name "*.cpp" -o -name "*.cu" \) 2>/dev/null) +# clang-tidy 15 does not fully support CUDA syntax (e.g. <<<...>>>, __global__). +# Exclude .cu files from tidy targets; clang-format and clangd handle them fine. +CPP_SOURCES_NO_CUDA := $(filter-out %.cu,$(CPP_SOURCES)) + +# Stamp-file guard: uv pip install -e gigl-core/ triggers a full CMake configure-and-build +# cycle even when nothing changed. By making the stamp file depend on C++ sources and +# pyproject.toml, make skips the reinstall unless something actually changed. +# We cd to the repo root so that no-build-isolation-package in the root pyproject.toml +# is respected by uv pip install. +.cache/cmake_build/CMakeInit.txt: $(shell find csrc \( -name '*.cpp' -o -name '*.cu' -o -name '*.h' -o -name '*.cuh' \) 2>/dev/null) CMakeLists.txt pyproject.toml + cd $(abspath $(CURDIR)/..) && uv pip install -e gigl-core/ + +build_cpp_extensions: .cache/cmake_build/CMakeInit.txt + +.cache/cpp_tests/.configured: CMakeLists.txt tests/CMakeLists.txt .cache/cmake_build/CMakeInit.txt + cmake -C .cache/cmake_build/CMakeInit.txt -S . -B .cache/cpp_tests -DGIGL_CORE_BUILD_TESTS=ON + touch .cache/cpp_tests/.configured + +unit_test_cpp: .cache/cpp_tests/.configured + cmake --build .cache/cpp_tests --parallel + ctest --test-dir .cache/cpp_tests --output-on-failure + +# TODO: Remove the $(if ...) guards once C++ source files are permanently present in the +# repo. The guards exist to silently no-op on branches that have no python_*.cpp files yet. +check_format_cpp: + $(if $(CPP_SOURCES),clang-format-15 --dry-run --Werror --style=file $(CPP_SOURCES)) + +format_cpp: + $(if $(CPP_SOURCES),clang-format-15 -i --style=file $(CPP_SOURCES)) + +# Wipe cmake build caches. Use this if cmake's cached state becomes inconsistent +# after switching between branches with substantially different CMakeLists.txt structure. 
+clean_cpp: + rm -rf .cache/cpp_tests .cache/cmake_build + +clean_build_files_cpp: + rm -rf .cache/cpp_tests + +# Declare targets as phony so make always runs their recipes, even if a file or +# directory with the same name happens to exist on disk. +.PHONY: build_cpp_extensions unit_test_cpp check_format_cpp format_cpp clean_cpp clean_build_files_cpp diff --git a/gigl-core/README.md b/gigl-core/README.md new file mode 100644 index 000000000..c02f9301e --- /dev/null +++ b/gigl-core/README.md @@ -0,0 +1,12 @@ +# gigl-core + +C++/CUDA pybind11 extension modules for [GiGL](https://github.com/snapchat/gigl). + +This package contains the compiled native extensions. It is a workspace member of the main `gigl` package and is built +separately via scikit-build-core. + +## Building + +```bash +uv pip install -e gigl-core/ +``` diff --git a/gigl-core/pyproject.toml b/gigl-core/pyproject.toml new file mode 100644 index 000000000..8fc595572 --- /dev/null +++ b/gigl-core/pyproject.toml @@ -0,0 +1,36 @@ +[project] +name = "gigl-core" +description = "GiGL C++/CUDA kernels (pybind11 extensions)" +readme = "README.md" +version = "0.2.0" +requires-python = "==3.11.*" +# Torch is resolved from the ambient environment. gigl-core wheels are ABI-bound +# to the torch variant they were built against (cpu or cu128). The parent `gigl` +# extras (pyg27-torch28-cpu / pyg27-torch28-cu128) already pin torch==2.8, and +# pip picks the matching gigl-core wheel from whichever GCP registry the user +# configured for installing gigl itself. +dependencies = [] + +[build-system] +requires = ["scikit-build-core>=0.10", "pybind11>=2.12"] +build-backend = "scikit_build_core.build" + +[tool.scikit-build] +cmake.version = ">=3.18" +build-dir = ".cache/cmake_build" +# Default editable mode is `redirect`: scikit-build-core writes compiled .so +# files into .cache/cmake_build/ and installs an import-hook shim into +# site-packages that resolves them. 
`editable.rebuild` stays off so imports +# never block on CMake — explicit `uv pip install -e gigl-core/` drives rebuilds. +editable.rebuild = false + +[tool.uv] +# Invalidate the uv build cache when any of these inputs change. +cache-keys = [ + { file = "pyproject.toml" }, + { file = "CMakeLists.txt" }, + { file = "csrc/**/*.h" }, + { file = "csrc/**/*.cpp" }, + { file = "csrc/**/*.cu" }, + { file = "csrc/**/*.cuh" }, +] diff --git a/gigl-core/requirements/install_cpp_deps.sh b/gigl-core/requirements/install_cpp_deps.sh new file mode 100644 index 000000000..497e4cb45 --- /dev/null +++ b/gigl-core/requirements/install_cpp_deps.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Install C++ development tools: clang-format, clang-tidy, cmake. +# +# Usage: +# bash gigl-core/requirements/install_cpp_deps.sh +# +# Called by `make install_dev_deps` alongside install_py_deps.sh and +# install_scala_deps.sh. +# +# NOTE: macOS is not supported. C++ tooling requires GLT, which does not run on macOS. + +set -e +set -x + +if [ "$(uname)" == "Darwin" ]; then + echo "ERROR: macOS is not supported for C++ tooling (GLT does not run on macOS)." >&2 + exit 1 +fi + +# clang++-15 requires libstdc++-12-dev: on Ubuntu 22.04, clang++-15 looks for GCC 12 +# headers. Without this package clang++-15 cannot find standard headers like <iostream>. +sudo apt-get update -y +sudo apt-get install -y clang-format-15 clang-tidy-15 clangd-15 clang++-15 libstdc++-12-dev cmake + +# Verify cmake >= 3.18 (our CMakeLists.txt requires it; Ubuntu 20.04 apt provides 3.16). +cmake_version=$(cmake --version | awk 'NR==1{print $3}') +if ! printf '3.18\n%s\n' "$cmake_version" | sort -V -C 2>/dev/null; then + echo "ERROR: cmake >= 3.18 required, found $cmake_version. 
See https://cmake.org/download/" >&2 + exit 1 +fi + +echo "Finished installing C++ tooling" diff --git a/gigl-core/src/gigl_core/__init__.py b/gigl-core/src/gigl_core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/gigl-core/tests/CMakeLists.txt b/gigl-core/tests/CMakeLists.txt new file mode 100644 index 000000000..74eac1f40 --- /dev/null +++ b/gigl-core/tests/CMakeLists.txt @@ -0,0 +1,52 @@ +include(CheckLanguage) +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) +endif() + +# --------------------------------------------------------------------------- +# GoogleTest via FetchContent +# --------------------------------------------------------------------------- +include(FetchContent) +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG 58d77fa8070e8cec2dc1ed015d66b454c8d78850 # v1.14.0 + GIT_SHALLOW TRUE +) +# Prevent GoogleTest from overriding the compiler's runtime on Windows +# (no-op on Linux/Mac, but required for portable CMake config). +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Required for add_test() to register tests with CTest. +enable_testing() + +# --------------------------------------------------------------------------- +# Auto-discover test targets +# --------------------------------------------------------------------------- +# Any file named *_test.cpp in this directory (or subdirectories) is +# automatically compiled into its own test binary and registered with CTest. +# To add a new test suite, drop a *_test.cpp file here — no changes to this +# file required. This matches the *_test.py convention used for Python tests. 
+if(CMAKE_CUDA_COMPILER) + file(GLOB_RECURSE TEST_SOURCES "*_test.cpp" "*_test.cu") +else() + file(GLOB_RECURSE TEST_SOURCES "*_test.cpp") +endif() + +foreach(test_source ${TEST_SOURCES}) + # Derive a unique binary name from the path relative to this directory, e.g.: + # foo_test.cpp → foo_test + # sampling/foo_test.cpp → sampling_foo_test + file(RELATIVE_PATH _rel "${CMAKE_CURRENT_SOURCE_DIR}" "${test_source}") + string(REPLACE "/" "_" test_name "${_rel}") + string(REGEX REPLACE "\\.[^.]+$" "" test_name "${test_name}") + add_executable(${test_name} ${test_source}) + target_link_libraries(${test_name} GTest::gtest_main) + # add_test registers the binary with CTest. Each *_test binary is one + # CTest entry; GoogleTest itself reports individual TEST() results inside it. + add_test(NAME ${test_name} COMMAND ${test_name}) +endforeach() diff --git a/gigl-core/tests/infrastructure_test.cpp b/gigl-core/tests/infrastructure_test.cpp new file mode 100644 index 000000000..eb3c1aa3d --- /dev/null +++ b/gigl-core/tests/infrastructure_test.cpp @@ -0,0 +1,12 @@ +// Placeholder C++ unit test. +// +// This file exists to verify that the GoogleTest infrastructure compiles and +// runs end-to-end. + +#include <gtest/gtest.h> + +// A trivial sanity-check test — if this fails, something is very wrong with +// the build environment itself. +TEST(PlaceholderTest, BasicArithmetic) { + EXPECT_EQ(1 + 1, 2); +} diff --git a/gigl-core/uv.lock b/gigl-core/uv.lock new file mode 100644 index 000000000..83b24499e --- /dev/null +++ b/gigl-core/uv.lock @@ -0,0 +1,8 @@ +version = 1 +revision = 3 +requires-python = "==3.11.*" + +[[package]] +name = "gigl-core" +version = "0.2.0" +source = { editable = "." } diff --git a/gigl/dep_vars.env b/gigl/dep_vars.env index 85ef4d21b..4b28e38b7 100644 --- a/gigl/dep_vars.env +++ b/gigl/dep_vars.env @@ -1,7 +1,7 @@ # Note this file only supports static key value pairs so it can be loaded by make, bash, python, and sbt without any additional parsing. 
-DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:b34c863a2168c8df5a6da1f6385e5d374f0175d2.91.1 -DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:b34c863a2168c8df5a6da1f6385e5d374f0175d2.91.1 -DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:b34c863a2168c8df5a6da1f6385e5d374f0175d2.91.1 +DOCKER_LATEST_BASE_CUDA_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cuda-base:b598f3d72eee47f5513dcb39460944459a0a012f.108.1 +DOCKER_LATEST_BASE_CPU_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-cpu-base:b598f3d72eee47f5513dcb39460944459a0a012f.108.1 +DOCKER_LATEST_BASE_DATAFLOW_IMAGE_NAME_WITH_TAG=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/gigl-dataflow-base:b598f3d72eee47f5513dcb39460944459a0a012f.108.1 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cuda:0.2.0 DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU=us-central1-docker.pkg.dev/external-snap-ci-github-gigl/public-gigl/src-cpu:0.2.0 diff --git a/gigl/orchestration/Dockerfile.customer_src b/gigl/orchestration/Dockerfile.customer_src index 25a5ea762..d7bf6947d 100644 --- a/gigl/orchestration/Dockerfile.customer_src +++ b/gigl/orchestration/Dockerfile.customer_src @@ -10,4 +10,4 @@ WORKDIR /gigl COPY . . 
# Find out if there is 'setup.py' or 'pyproject.toml' in the current directory, if so install it -RUN if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then pip install -e .; fi +RUN if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then uv pip install -e .; fi diff --git a/pyproject.toml b/pyproject.toml index b0d0345f2..ebbc65ea8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,9 @@ requires-python = "==3.11.*" # Limited by tfx-bsl not having wheels available fo dependencies = [ "argo-workflows", "chardet", + # gigl-core hosts all C++ / CUDA / pybind11 extensions. Separate wheel per torch + # variant (cpu/cu128). Version must match gigl exactly. + "gigl-core==0.2.0", "google-cloud-aiplatform", "google-cloud-dataproc", "google-cloud-logging", @@ -99,9 +102,18 @@ required-environments = [ "sys_platform == 'linux' and platform_machine == 'x86_64'", "sys_platform == 'darwin' and platform_machine == 'arm64'", ] +# gigl-core must build against torch, which is not on PyPI and cannot be installed +# into an isolated build environment. Disabling isolation lets cmake find torch from +# the ambient venv (installed by make install_dev_deps / base Docker images). +no-build-isolation-package = ["gigl-core"] [dependency-groups] +# scikit-build-core is gigl-core's PEP 517 build backend. With no-build-isolation-package +# set, uv does not install [build-system].requires automatically, so it must be present +# in the ambient environment before any gigl-core build (uv sync, uv build, Dockerfiles). +gigl-core-build-backend = ["scikit-build-core>=0.10"] dev = [ + {include-group = "gigl-core-build-backend"}, {include-group = "docs"}, {include-group = "lint"}, {include-group = "test"}, @@ -177,12 +189,26 @@ explicit = true format = "flat" # =============================== Google Artifact Registry Index ======================== +# Two separate indexes — one per torch variant — so pip always fetches the correct wheel. 
+# If both variants lived in the same index, pip would pick between them arbitrarily since +# both wheels share the same platform tag (linux_x86_64). Separate indexes remove the +# ambiguity and are essential once any extension contains CUDA kernels. +# +# The CPU registry reuses the original `gigl` repo for backwards compatibility. +# The CUDA registry is a new `gigl-cu128` repo that must be created in GCP project +# `external-snap-ci-github-gigl` before the release workflow will succeed. [[tool.uv.index]] -name = "gcp-release-registry" +name = "gcp-release-registry-cpu" url = "https://us-central1-python.pkg.dev/external-snap-ci-github-gigl/gigl/simple/" publish-url = "https://us-central1-python.pkg.dev/external-snap-ci-github-gigl/gigl" explicit = true +[[tool.uv.index]] +name = "gcp-release-registry-cu128" +url = "https://us-central1-python.pkg.dev/external-snap-ci-github-gigl/gigl-cu128/simple/" +publish-url = "https://us-central1-python.pkg.dev/external-snap-ci-github-gigl/gigl-cu128" +explicit = true + # ========== Mapping individual packages to their appropriate index ============= [tool.uv.sources] # ============= PyTorch hosted Package Index Mappings @@ -221,30 +247,34 @@ torch_spline_conv = [ { extra = "pyg27-torch28-cpu", index = "pyg-torch28-cpu" }, { extra = "pyg27-torch28-cu128", index = "pyg-torch28-cu128" }, ] - +# gigl-core is a local path dependency. In dev, uv resolves it from the adjacent +# gigl-core/ directory and builds the C++ extension via scikit-build-core. +# At release install time, published gigl and gigl-core wheels are resolved +# from the same GCP registry. +gigl-core = { path = "gigl-core" } # ===================== Build/Project Configurations =========================== +# gigl is a pure-Python package. All C++/CUDA kernels live in the gigl-core +# wheel (see gigl-core/pyproject.toml). Release builds two variants per package: +# cpu and cu128. gigl's build stays fast because no CMake runs here. 
[build-system] -requires = ["setuptools>=61.0.0", "wheel"] +requires = ["setuptools>=68"] build-backend = "setuptools.build_meta" - [tool.setuptools.packages.find] -where = ["."] # list of folders that contain the packages -include = ["gigl*", "snapchat*"] # Include only packages that match the specified patterns - -[project.urls] -Homepage = "https://github.com/snapchat/gigl" - -[project.scripts] -gigl-post-install = "gigl.scripts.post_install:main" +include = ["gigl*", "snapchat*"] [tool.setuptools.package-data] # Include dep_vars.env from the root directory "gigl" = ["dep_vars.env", "**/*.yaml"] "gigl.scripts" = ["*.sh"] +[project.urls] +Homepage = "https://github.com/snapchat/gigl" + +[project.scripts] +gigl-post-install = "gigl.scripts.post_install:main" [tool.ruff] # Skip generated proto files. diff --git a/requirements/install_py_deps.sh b/requirements/install_py_deps.sh index 6a328165e..2fa146556 100644 --- a/requirements/install_py_deps.sh +++ b/requirements/install_py_deps.sh @@ -72,7 +72,7 @@ install_uv_if_needed() { sh uv_installer.sh rm -f uv_installer.sh - source $HOME/.local/bin/env + export PATH="$HOME/.local/bin:$PATH" fi } @@ -143,7 +143,7 @@ install_gigl_lib_deps() { # https://docs.astral.sh/uv/reference/cli/#uv-sync uv sync ${extra_deps_clause[@]} --group dev --locked ${flag_use_inexact_match} else - uv sync ${extra_deps_clause[@]} --locked ${flag_use_inexact_match} + uv sync ${extra_deps_clause[@]} --group gigl-core-build-backend --locked ${flag_use_inexact_match} fi # Taken from https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script diff --git a/scripts/bump_version.py b/scripts/bump_version.py index 49cbaab1f..499f99ed9 100644 --- a/scripts/bump_version.py +++ b/scripts/bump_version.py @@ -100,6 +100,17 @@ def update_pyproject(version: str) -> None: with open(path, "r") as f: content = f.read() content = re.sub(r'(version\s*)=\s*"[\d\.]+"', f'\\1= "{version}"', content) + # 
Keep the gigl-core pin in sync with the new version. + content = re.sub(r'"gigl-core==[\d\.a-zA-Z]+"', f'"gigl-core=={version}"', content) + with open(path, "w") as f: + f.write(content) + + +def update_gigl_core_pyproject(version: str) -> None: + path = f"{GIGL_ROOT_DIR}/gigl-core/pyproject.toml" + with open(path, "r") as f: + content = f.read() + content = re.sub(r'(version\s*)=\s*"[\d\.]+"', f'\\1= "{version}"', content) with open(path, "w") as f: f.write(content) @@ -161,6 +172,7 @@ def bump_version( ) update_version(version=new_version) update_pyproject(version=new_version) + update_gigl_core_pyproject(version=new_version) print( f"Bumped to GiGL Version: {new_version}! To release, raise a PR with these changes and after it is merged, tag main with the version and run make release_gigl." diff --git a/scripts/run_cpp_lint.py b/scripts/run_cpp_lint.py new file mode 100644 index 000000000..7e2db01c9 --- /dev/null +++ b/scripts/run_cpp_lint.py @@ -0,0 +1,91 @@ +"""Run C++ lint on source files using clangd. + +Runs clangd --check on each file in parallel and prints a clean summary. +Expects compile_commands.json to already exist at +gigl-core/.cache/cmake_build/compile_commands.json; call +``make build_cpp_extensions`` first if it is absent or stale +(``make check_lint_cpp`` does this automatically via a Makefile prerequisite). + +Usage:: + + uv run python scripts/run_cpp_lint.py file1.cpp [file2.cpp] ...
+""" + +import re +import subprocess +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +_REPO_ROOT = Path(__file__).resolve().parent.parent +COMPILE_COMMANDS = ( + _REPO_ROOT / "gigl-core" / ".cache" / "cmake_build" / "compile_commands.json" +) + +# Matches real clang-tidy diagnostics emitted by clangd: +# E[HH:MM:SS.mmm] [check-name] Line N: message +_DIAGNOSTIC_RE = re.compile(r"^E\[[\d:.]+\] (\[.+\] .+)$") + + +def _check_file(source: Path) -> list[str]: + result = subprocess.run( + [ + "clangd-15", + f"--check={source}", + f"--compile-commands-dir={COMPILE_COMMANDS.parent}", + "--query-driver=/usr/bin/clang++-15,/usr/bin/g++", + ], + capture_output=True, + text=True, + ) + diagnostics: list[str] = [] + completed_normally = False + for line in result.stderr.splitlines(): + if "All checks completed" in line: + completed_normally = True + m = _DIAGNOSTIC_RE.match(line) + if m: + diagnostics.append(m.group(1)) + # Only treat a non-zero exit as a crash if clangd didn't reach its normal + # completion message. A non-zero exit with "All checks completed" means + # clangd found only IDE-action probe failures (tweak: ... ==> FAIL), which + # are not lint violations and should be ignored. 
+ if not completed_normally and result.returncode != 0: + diagnostics = [ + f"clangd exited with code {result.returncode} (tool error or crash)" + ] + return diagnostics + + +def main() -> None: + sources = [Path(s) for s in sys.argv[1:]] + if not sources: + sys.exit(0) + + failures: dict[Path, list[str]] = {} + with ThreadPoolExecutor() as executor: + futures = {executor.submit(_check_file, s): s for s in sources} + for future in as_completed(futures): + source = futures[future] + try: + diagnostics = future.result() + except Exception as exc: + diagnostics = [f"linter error: {exc}"] + if diagnostics: + failures[source] = diagnostics + + if not failures: + print("\033[32mC++ lint passed.\033[0m") + else: + for source in sorted(failures): + print(f" FAIL {source}") + for d in failures[source]: + print(f" {d}") + print( + "\nRun \033[1mmake fix_lint_cpp\033[0m to auto-fix violations where possible." + ) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/uv.lock b/uv.lock index 8e369b607..b8303f693 100644 --- a/uv.lock +++ b/uv.lock @@ -607,8 +607,8 @@ dependencies = [ { name = "numpy", marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cpu/fbgemm_gpu-1.3.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:4154e803ba762906a604a72aa41685fdd49459fce55cea79d42ac7c45c8770ca" }, - { url = "https://download-r2.pytorch.org/whl/cpu/fbgemm_gpu-1.3.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:0267ec844b43028f4b9b8e14acd16276e82bb97f91b6b1078f732eb9225b20c6" }, + { url = "https://download-r2.pytorch.org/whl/cpu/fbgemm_gpu-1.3.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:4154e803ba762906a604a72aa41685fdd49459fce55cea79d42ac7c45c8770ca", upload-time = "2025-08-22T18:49:45Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/fbgemm_gpu-1.3.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = 
"sha256:0267ec844b43028f4b9b8e14acd16276e82bb97f91b6b1078f732eb9225b20c6", upload-time = "2025-08-22T18:49:45Z" }, ] [[package]] @@ -619,7 +619,7 @@ dependencies = [ { name = "numpy" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cu128/fbgemm_gpu-1.3.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:365a2c7f89e89f6d8acf3af5101cbb1651cd1cc64057fd2902feae490814cee3" }, + { url = "https://download-r2.pytorch.org/whl/cu128/fbgemm_gpu-1.3.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:365a2c7f89e89f6d8acf3af5101cbb1651cd1cc64057fd2902feae490814cee3", upload-time = "2025-08-22T18:50:02Z" }, ] [[package]] @@ -707,6 +707,7 @@ source = { editable = "." } dependencies = [ { name = "argo-workflows" }, { name = "chardet" }, + { name = "gigl-core" }, { name = "google-cloud-aiplatform" }, { name = "google-cloud-dataproc" }, { name = "google-cloud-logging" }, @@ -785,6 +786,7 @@ dev = [ { name = "pre-commit" }, { name = "pydata-sphinx-theme" }, { name = "ruff" }, + { name = "scikit-build-core" }, { name = "sphinx" }, { name = "sphinx-autoapi" }, { name = "sphinx-autodoc-typehints" }, @@ -815,6 +817,9 @@ docs = [ { name = "sphinx-rtd-theme" }, { name = "sphinx-tabs" }, ] +gigl-core-build-backend = [ + { name = "scikit-build-core" }, +] lint = [ { name = "mdformat" }, { name = "mdformat-tables" }, @@ -841,6 +846,7 @@ requires-dist = [ { name = "chardet" }, { name = "fbgemm-gpu", marker = "sys_platform != 'darwin' and extra == 'pyg27-torch28-cpu'", specifier = "~=1.3.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "gigl", extra = "pyg27-torch28-cpu" } }, { name = "fbgemm-gpu", marker = "sys_platform != 'darwin' and extra == 'pyg27-torch28-cu128'", specifier = "~=1.3.0", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "gigl", extra = "pyg27-torch28-cu128" } }, + { name = "gigl-core", directory = "gigl-core" }, { name = "google-cloud-aiplatform" }, { name = "google-cloud-dataproc" }, { 
name = "google-cloud-logging" }, @@ -906,6 +912,7 @@ dev = [ { name = "pre-commit", specifier = "==3.3.2" }, { name = "pydata-sphinx-theme", specifier = "==0.16.1" }, { name = "ruff", specifier = "==0.15.10" }, + { name = "scikit-build-core", specifier = ">=0.10" }, { name = "sphinx", specifier = "==7.4.7" }, { name = "sphinx-autoapi", specifier = "==3.6.0" }, { name = "sphinx-autodoc-typehints", specifier = "==2.3.0" }, @@ -936,6 +943,7 @@ docs = [ { name = "sphinx-rtd-theme", specifier = "==2.0.0" }, { name = "sphinx-tabs", specifier = "==3.4.5" }, ] +gigl-core-build-backend = [{ name = "scikit-build-core", specifier = ">=0.10" }] lint = [ { name = "mdformat", specifier = "==0.7.22" }, { name = "mdformat-tables", specifier = "==1.0.0" }, @@ -953,6 +961,11 @@ typing-stubs = [ { name = "types-tqdm", specifier = "==4.67.0.20250513" }, ] +[[package]] +name = "gigl-core" +version = "0.2.0" +source = { directory = "gigl-core" } + [[package]] name = "google-api-core" version = "2.28.1" @@ -2824,6 +2837,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/16/32/f8e3c85d1d5250232a5d3477a2a28cc291968ff175caeadaf3cc19ce0e4a/parso-0.8.5-py2.py3-none-any.whl", hash = "sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887", size = 106668, upload-time = "2025-08-23T15:15:25.663Z" }, ] +[[package]] +name = "pathspec" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/82/42f767fc1c1143d6fd36efb827202a2d997a375e160a71eb2888a925aac1/pathspec-1.1.1.tar.gz", hash = "sha256:17db5ecd524104a120e173814c90367a96a98d07c45b2e10c2f3919fff91bf5a", size = 135180, upload-time = "2026-04-27T01:46:08.907Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/d9/7fb5aa316bc299258e68c73ba3bddbc499654a07f151cba08f6153988714/pathspec-1.1.1-py3-none-any.whl", hash = "sha256:a00ce642f577bf7f473932318056212bc4f8bfdf53128c78bbd5af0b9b20b189", size = 57328, upload-time = 
"2026-04-27T01:46:07.06Z" }, +] + [[package]] name = "pexpect" version = "4.9.0" @@ -3511,6 +3533,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/58/ed/dea90a65b7d9e69888890fb14c90d7f51bf0c1e82ad800aeb0160e4bacfd/ruff-0.15.10-py3-none-win_arm64.whl", hash = "sha256:601d1610a9e1f1c2165a4f561eeaa2e2ea1e97f3287c5aa258d3dab8b57c6188", size = 11035607, upload-time = "2026-04-09T14:05:47.593Z" }, ] +[[package]] +name = "scikit-build-core" +version = "0.12.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name = "pathspec" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/cd/9ebb50029b6d8a3ee9e38cdce514ebd70190ec1edf28ab0a1f66d0b84670/scikit_build_core-0.12.2.tar.gz", hash = "sha256:562e0bbc9de1a354c87825ccf732080268d6582a0200f648e8c4a2dcb1e3736d", size = 303553, upload-time = "2026-03-05T18:25:57.666Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/49/b2f0fbe3165d55c02e7f9eec6a10685d518af0ef6e919ff2f589c2d15c85/scikit_build_core-0.12.2-py3-none-any.whl", hash = "sha256:6ea4730da400f9a998ec3287bd3ebc1d751fe45ad0a93451bead8618adbc02b1", size = 192625, upload-time = "2026-03-05T18:25:56.207Z" }, +] + [[package]] name = "scikit-learn" version = "1.7.2" @@ -4119,7 +4154,7 @@ dependencies = [ { name = "typing-extensions", marker = "platform_machine == 'arm64' and sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:3d05017d19bc99741288e458888283a44b0ee881d53f05f72f8b1cfea8998122" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:3d05017d19bc99741288e458888283a44b0ee881d53f05f72f8b1cfea8998122", upload-time = "2025-10-01T23:35:48Z" }, ] [[package]] @@ -4139,11 +4174,11 @@ dependencies = [ { name = "typing-extensions", marker = "(platform_machine != 'arm64' and extra == 'extra-4-gigl-pyg27-torch28-cpu') or 
(sys_platform != 'darwin' and extra == 'extra-4-gigl-pyg27-torch28-cpu') or (extra == 'extra-4-gigl-pyg27-torch28-cpu' and extra == 'extra-4-gigl-pyg27-torch28-cu128')" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-linux_s390x.whl", hash = "sha256:2bfc013dd6efdc8f8223a0241d3529af9f315dffefb53ffa3bf14d3f10127da6" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:680129efdeeec3db5da3f88ee5d28c1b1e103b774aef40f9d638e2cce8f8d8d8" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cb06175284673a581dd91fb1965662ae4ecaba6e5c357aa0ea7bb8b84b6b7eeb" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:7631ef49fbd38d382909525b83696dc12a55d68492ade4ace3883c62b9fc140f" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-win_arm64.whl", hash = "sha256:41e6fc5ec0914fcdce44ccf338b1d19a441b55cafdd741fd0bf1af3f9e4cfd14" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-linux_s390x.whl", hash = "sha256:2bfc013dd6efdc8f8223a0241d3529af9f315dffefb53ffa3bf14d3f10127da6", upload-time = "2025-10-01T23:33:07Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:680129efdeeec3db5da3f88ee5d28c1b1e103b774aef40f9d638e2cce8f8d8d8", upload-time = "2025-10-01T23:33:11Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cb06175284673a581dd91fb1965662ae4ecaba6e5c357aa0ea7bb8b84b6b7eeb", upload-time = "2025-10-01T23:33:14Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:7631ef49fbd38d382909525b83696dc12a55d68492ade4ace3883c62b9fc140f", upload-time = 
"2025-10-01T23:33:20Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp311-cp311-win_arm64.whl", hash = "sha256:41e6fc5ec0914fcdce44ccf338b1d19a441b55cafdd741fd0bf1af3f9e4cfd14", upload-time = "2025-10-01T23:33:36Z" }, ] [[package]] @@ -4174,8 +4209,8 @@ dependencies = [ { name = "typing-extensions", marker = "extra == 'extra-4-gigl-pyg27-torch28-cu128'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:039b9dcdd6bdbaa10a8a5cd6be22c4cb3e3589a341e5f904cbb571ca28f55bed" }, - { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:34c55443aafd31046a7963b63d30bc3b628ee4a704f826796c865fdfd05bb596" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:039b9dcdd6bdbaa10a8a5cd6be22c4cb3e3589a341e5f904cbb571ca28f55bed", upload-time = "2025-10-01T23:49:06Z" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.8.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:34c55443aafd31046a7963b63d30bc3b628ee4a704f826796c865fdfd05bb596", upload-time = "2025-10-01T23:49:30Z" }, ] [[package]] @@ -4330,7 +4365,7 @@ dependencies = [ { name = "tqdm", marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torchrec-1.3.0%2Bcpu-py3-none-any.whl", hash = "sha256:be2b572625792feac1656afcac19e35448df5447d215575a4b8cb22d9220d2cf" }, + { url = "https://download.pytorch.org/whl/cpu/torchrec-1.3.0%2Bcpu-py3-none-any.whl", hash = "sha256:be2b572625792feac1656afcac19e35448df5447d215575a4b8cb22d9220d2cf", upload-time = "2025-09-17T07:14:32Z" }, ] [[package]] @@ -4346,7 +4381,7 @@ dependencies = [ { name = "tqdm" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchrec-1.3.0%2Bcu128-py3-none-any.whl", hash = 
"sha256:6de7e4a70a6e95815a8f06b1dec4d982cea4d32fa7d86a10a8bb4c52b8a749b9" }, + { url = "https://download.pytorch.org/whl/cu128/torchrec-1.3.0%2Bcu128-py3-none-any.whl", hash = "sha256:6de7e4a70a6e95815a8f06b1dec4d982cea4d32fa7d86a10a8bb4c52b8a749b9", upload-time = "2025-09-17T07:14:37Z" }, ] [[package]]