@@ -29,6 +29,24 @@ Follow the instructions to set up the Conda environment:
2929#. :ref:`fbgemm-gpu.build.setup.tools.install`
3030#. :ref:`fbgemm-gpu.build.setup.pytorch.install`
3131
32+ Installing PyTorch for CUDA Builds
33+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34+
35+ For CUDA builds, install PyTorch with matching CUDA version support:
36+
37+ .. code:: sh
38+
39+ # !! Run inside the Conda environment !!
40+
41+ # For CUDA 12.9 with PyTorch nightly (recommended for latest features)
42+ pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu129
43+
44+ # For CUDA 12.8 with PyTorch stable
45+ pip install torch --index-url https://download.pytorch.org/whl/cu128
46+
47+ # Verify PyTorch installation
48+ python -c "import torch; print(f'PyTorch version: {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}'); print(f'CUDA version: {torch.version.cuda}')"
49+
3250
3351 Other Pre-Build Setup
3452---------------------
@@ -57,6 +75,31 @@ Clone the repo along with its submodules, and install ``requirements_genai.txt``
5775 cd fbgemm_${FBGEMM_VERSION}/fbgemm_gpu
5876 pip install -r requirements_genai.txt
5977
78+ Initialize Git Submodules
79+ ~~~~~~~~~~~~~~~~~~~~~~~~~
80+
81+ FBGEMM GenAI relies on several submodules, including CUTLASS for optimized CUDA kernels.
82+ If you didn't use ``--recursive`` when cloning, initialize the submodules:
83+
84+ .. code:: sh
85+
86+ # Sync and initialize all submodules including CUTLASS
87+ git submodule sync
88+ git submodule update --init --recursive
89+
90+ # Verify CUTLASS is available
91+ ls external/cutlass/include
92+
93+ Install NCCL for Distributed Support
94+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
95+
96+ For distributed communication support, install NCCL via conda:
97+
98+ .. code:: sh
99+
100+ # !! Run inside the Conda environment !!
101+ conda install -c conda-forge nccl -y
102+
60103 Set Wheel Build Variables
61104~~~~~~~~~~~~~~~~~~~~~~~~~
62105
@@ -97,17 +140,53 @@ Similar to CPU-only builds, building with Clang + ``libstdc++`` can be enabled
97140by appending ``--cxxprefix=$CONDA_PREFIX`` to the build command, presuming the
98141toolchains have been properly installed.
99142
143+ Environment Setup for CUDA Builds
144+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
145+
146+ Set up the necessary environment variables for a CUDA build:
147+
100148.. code:: sh
101149
102150 # !! Run in fbgemm_gpu/ directory inside the Conda environment !!
103151
104- # [OPTIONAL] Specify the CUDA installation paths
105- # This may be required if CMake is unable to find nvcc
106- export CUDACXX=/path/to/nvcc
107- export CUDA_BIN_PATH=/path/to/cuda/installation
152+ # Specify CUDA paths (adjust to your CUDA installation)
153+ export CUDA_HOME="/usr/local/cuda"
154+ export CUDACXX="${CUDA_HOME}/bin/nvcc"
155+ export PATH="${CUDA_HOME}/bin:${PATH}"
156+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
157+
158+ # Specify NVML filepath (usually in CUDA stubs directory)
159+ export NVML_LIB_PATH="${CUDA_HOME}/lib64/stubs/libnvidia-ml.so"
160+
161+ # Specify NCCL filepath (installed via conda)
162+ export NCCL_LIB_PATH="${CONDA_PREFIX}/lib/libnccl.so"
163+
164+ CUDA Architecture Configuration
165+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
166+
167+ Configure the target CUDA architectures for your hardware:
168+
169+ .. code:: sh
170+
171+ # Build for SM70/80 (V100/A100 GPU); update as needed
172+ # If not specified, only the CUDA architecture supported by current system will be targeted
173+ # If not specified and no CUDA device is present either, all CUDA architectures will be targeted
174+ cuda_arch_list="7.0;8.0"
175+
176+ # For NVIDIA Blackwell architecture (GB100, GB200):
177+ # cuda_arch_list=10.0a
178+ # export TORCH_CUDA_ARCH_LIST="10.0a"
179+
180+ # Unset TORCH_CUDA_ARCH_LIST if it exists, bc it takes precedence over
181+ # -DTORCH_CUDA_ARCH_LIST during the invocation of setup.py
182+ unset TORCH_CUDA_ARCH_LIST
183+
184+ Optional NVCC Configuration
185+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
108186
109- # [OPTIONAL] Provide the CUB installation directory (applicable only to CUDA versions prior to 11.1)
110- export CUB_DIR=/path/to/cub
187+ Additional NVCC configuration options:
188+
189+ .. code:: sh
111190
112191 # [OPTIONAL] Allow NVCC to use host compilers that are newer than what NVCC officially supports
113192 nvcc_prepend_flags=(
@@ -126,24 +205,17 @@ toolchains have been properly installed.
126205 # [OPTIONAL] Enable verbose NVCC logs
127206 export NVCC_VERBOSE=1
128207
129- # Specify cuDNN header and library paths
130- export CUDNN_INCLUDE_DIR=/path/to/cudnn/include
131- export CUDNN_LIBRARY=/path/to/cudnn/lib
208+ Building the Package
209+ ~~~~~~~~~~~~~~~~~~~~
132210
133- # Specify NVML filepath
134- export NVML_LIB_PATH=/path/to/libnvidia-ml.so
135-
136- # Specify NCCL filepath
137- export NCCL_LIB_PATH=/path/to/libnccl.so.2
211+ .. code:: sh
138212
139- # Build for SM70/80 (V100/A100 GPU); update as needed
140- # If not specified, only the CUDA architecture supported by current system will be targeted
141- # If not specified and no CUDA device is present either, all CUDA architectures will be targeted
142- cuda_arch_list=7.0; 8.0
213+ # !! Run in fbgemm_gpu/ directory inside the Conda environment !!
143214
144- # Unset TORCH_CUDA_ARCH_LIST if it exists, bc it takes precedence over
145- # -DTORCH_CUDA_ARCH_LIST during the invocation of setup.py
146- unset TORCH_CUDA_ARCH_LIST
215+ # [OPTIONAL] Specify the CUDA installation paths
216+ # This may be required if CMake is unable to find nvcc
217+ export CUDACXX=/path/to/nvcc
218+ export CUDA_BIN_PATH=/path/to/cuda/installation
147219
148220 # Build the wheel artifact only
149221 python setup.py bdist_wheel \
@@ -215,3 +287,36 @@ Post-Build Checks (For Developers)
215287As FBGEMM GenAI leverages the same build process as FBGEMM_GPU, please refer to
216288:ref:`fbgemm-gpu.build.process.post-build` for information on additional
217289post-build checks.
290+
291+ Troubleshooting Build Issues
292+ -----------------------------
293+
294+ Common Issues and Solutions
295+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
296+
297+ 1. **CUTLASS not found**: Ensure git submodules are initialized:
298+
299+ .. code:: sh
300+
301+ git submodule sync
302+ git submodule update --init --recursive
303+
304+ 2. **CUDA version mismatch**: Ensure PyTorch CUDA version matches your system CUDA:
305+
306+ .. code:: sh
307+
308+ # Check system CUDA version
309+ nvcc --version
310+
311+ # Check PyTorch CUDA version
312+ python -c "import torch; print(torch.version.cuda)"
313+
314+ 3. **NVML/NCCL library not found**: Verify the library paths are correct:
315+
316+ .. code:: sh
317+
318+ # Check NVML exists
319+ ls -la ${NVML_LIB_PATH}
320+
321+ # Check NCCL exists
322+ ls -la ${NCCL_LIB_PATH}
0 commit comments