Commit e1af7d1

Merge branch 'main' into pre-commit-ci-update-config
2 parents: 731d3f4 + 1fb83d2
File tree: 7 files changed (+1961 / -1740 lines)

.github/workflows/docker.yml
1 addition & 1 deletion

@@ -45,7 +45,7 @@ jobs:
           images: vectorinstitute/vector-inference

       - name: Build and push Docker image
-        uses: docker/build-push-action@1dc73863535b631f98b2378be8619f83b136f4a0
+        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83
         with:
           context: .
           file: ./Dockerfile
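The change above swaps one pinned commit of `docker/build-push-action` for another; both references pin the action to a full 40-character commit SHA rather than a mutable tag like `v5`. As a minimal sketch (the helper is hypothetical, not part of this repository) of what distinguishes a SHA-pinned `uses:` reference from a tag-pinned one:

```python
import re

def is_sha_pinned(uses: str) -> bool:
    # A fully pinned action reference ends in a 40-character lowercase hex commit SHA.
    ref = uses.split("@", 1)[1] if "@" in uses else ""
    return re.fullmatch(r"[0-9a-f]{40}", ref) is not None

print(is_sha_pinned("docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83"))  # True
print(is_sha_pinned("docker/build-push-action@v5"))  # False
```

SHA pinning trades convenience for immutability: the workflow keeps building the exact action code that was reviewed, at the cost of manual bumps like this one.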

Dockerfile
3 additions & 7 deletions

@@ -1,13 +1,13 @@
-FROM nvidia/cuda:12.4.1-devel-ubuntu20.04
+FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04

 # Non-interactive apt-get commands
 ARG DEBIAN_FRONTEND=noninteractive

 # No GPUs visible during build
 ARG CUDA_VISIBLE_DEVICES=none

-# Specify CUDA architectures -> 7.5: RTX 6000 & T4, 8.0: A100, 8.6+PTX
-ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX"
+# Specify CUDA architectures -> 7.5: Quadro RTX 6000 & T4, 8.0: A100, 8.6: A40, 8.9: L40S, 9.0: H100
+ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0+PTX"

 # Set the Python version
 ARG PYTHON_VERSION=3.10.12
@@ -41,10 +41,6 @@ COPY . /vec-inf

 # Install project dependencies with build requirements
 RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu121" uv pip install --system -e .[dev]
-# Install FlashAttention
-RUN python3.10 -m pip install flash-attn --no-build-isolation
-# Install FlashInfer
-RUN python3.10 -m pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6/

 # Final configuration
 RUN mkdir -p /vec-inf/nccl && \
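The new `TORCH_CUDA_ARCH_LIST` widens the build from three compute capabilities to five, covering the GPUs named in the comment. As a sketch of how such a list decomposes (assuming the standard semicolon-separated format PyTorch builds accept, where a `+PTX` suffix on an entry additionally requests forward-compatible PTX; the helper name is hypothetical):

```python
def parse_arch_list(value: str):
    # Each entry is a compute capability; "+PTX" asks for PTX output
    # in addition to native SASS for that architecture.
    archs = []
    for entry in value.split(";"):
        archs.append((entry.removesuffix("+PTX"), entry.endswith("+PTX")))
    return archs

print(parse_arch_list("7.5;8.0;8.6;8.9;9.0+PTX"))
# [('7.5', False), ('8.0', False), ('8.6', False), ('8.9', False), ('9.0', True)]
```

Putting `+PTX` only on the newest entry (9.0) keeps the image smaller than compiling PTX for every arch, while still letting GPUs newer than H100 JIT-compile from the 9.0 PTX.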

README.md
2 additions & 2 deletions

@@ -7,7 +7,7 @@
 [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
 [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
 [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
-[![vLLM](https://img.shields.io/badge/vllm-0.8.5.post1-blue)](https://docs.vllm.ai/en/v0.8.5.post1/index.html)
+[![vLLM](https://img.shields.io/badge/vllm-0.9.2)](https://docs.vllm.ai/en/v0.9.2/index.html)
 ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)

 This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, follow the instructions in [Installation](#installation).
@@ -18,7 +18,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 ```bash
 pip install vec-inf
 ```
-Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.8.5.post1`.
+Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.9.2`.

 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.

docs/api.md
9 additions & 0 deletions

@@ -10,6 +10,15 @@ This section documents the Python API for vector-inference.
       show_root_full_path: true
       members: true

+## Model Config
+
+::: vec_inf.client.config.ModelConfig
+    options:
+      show_root_heading: true
+      show_root_full_path: true
+      members: true
+
+
 ## Data Models

 ::: vec_inf.client.models

docs/index.md
1 addition & 1 deletion

@@ -10,7 +10,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 pip install vec-inf
 ```

-Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.8.5.post1`.
+Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.9.2`.

 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.

pyproject.toml
3 additions & 3 deletions

@@ -1,6 +1,6 @@
 [project]
 name = "vec-inf"
-version = "0.6.1"
+version = "0.7.0"
 description = "Efficient LLM inference on Slurm clusters using vLLM."
 readme = "README.md"
 authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}]
@@ -40,8 +40,8 @@ docs = [
 [project.optional-dependencies]
 dev = [
     "xgrammar>=0.1.11",
-    "torch>=2.5.1",
-    "vllm>=0.7.3",
+    "torch>=2.7.0",
+    "vllm>=0.9.2",
     "vllm-nccl-cu12>=2.18,<2.19",
     "ray>=2.40.0",
     "cupy-cuda12x==12.1.0"
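The lower bounds for `torch` and `vllm` both move forward here, and the new floors strictly supersede the old ones. A simplified sketch of that comparison (numeric release segments only; real installers follow PEP 440, e.g. via the `packaging` library):

```python
def parse_version(v: str):
    # Compare release versions numerically, segment by segment,
    # so that e.g. "0.9.2" correctly sorts above "0.7.3".
    return tuple(int(part) for part in v.split("."))

# The bumped lower bounds from this commit.
assert parse_version("2.7.0") > parse_version("2.5.1")  # torch
assert parse_version("0.9.2") > parse_version("0.7.3")  # vllm
```

Because every environment satisfying the new constraints also satisfied the old ones (but not vice versa), the minor version bump to 0.7.0 signals a potentially breaking dependency change.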

0 commit comments