From ebae6541d3ffeb7b7a84b0a1d9636672ad3a2dee Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Thu, 20 Nov 2025 23:38:59 +0530 Subject: [PATCH] Product Rename: TensorRT Model Optimizer to Model Optimizer Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/ISSUE_TEMPLATE/1_bug_report.md | 4 +- .github/ISSUE_TEMPLATE/3_question.md | 4 +- .github/PULL_REQUEST_TEMPLATE.md | 4 +- CHANGELOG-Windows.rst | 17 +++--- CHANGELOG.rst | 32 ++++++------ CONTRIBUTING.md | 8 +-- README.md | 52 +++++++++---------- docs/source/conf.py | 2 +- docs/source/deployment/2_directml.rst | 2 +- docs/source/deployment/3_unified_hf.rst | 2 +- docs/source/examples/0_all_examples.rst | 2 +- docs/source/getting_started/1_overview.rst | 8 +-- .../windows/_installation_for_Windows.rst | 4 +- .../windows/_installation_standalone.rst | 2 +- .../windows/_installation_with_olive.rst | 14 ++--- docs/source/guides/0_support_matrix.rst | 4 +- docs/source/guides/1_quantization.rst | 2 +- docs/source/guides/3_pruning.rst | 4 +- docs/source/guides/7_nas.rst | 6 +-- .../guides/_customized_model_quantization.rst | 2 +- .../guides/windows_guides/_ONNX_PTQ_guide.rst | 6 +-- docs/source/index.rst | 16 +++--- docs/source/support/1_contact.rst | 2 +- examples/benchmark.md | 4 +- examples/cnn_qat/README.md | 4 +- examples/diffusers/README.md | 28 +++++----- examples/diffusers/quantization/quantize.py | 2 +- examples/gpt-oss/README.md | 8 +-- examples/llm_autodeploy/README.md | 2 +- examples/llm_distill/README.md | 20 +++---- examples/llm_eval/README.md | 4 +- examples/llm_ptq/README.md | 32 ++++++------ .../1_FP4-FP8_PTQ_Min-Max_Calibration.ipynb | 10 ++-- .../notebooks/2_PTQ_AWQ_Calibration.ipynb | 6 +-- .../notebooks/3_PTQ_AutoQuantization.ipynb | 4 +- examples/llm_qat/README.md | 34 ++++++------ .../notebooks/QAT_QAD_Walkthrough.ipynb | 12 ++--- examples/llm_sparsity/README.md | 4 +- examples/model_hub/README.md | 2 +- examples/nemo_run/prune_distill/README.md | 6 +-- examples/nemo_run/qat/README.md | 6 +-- examples/nemo_run/qat/nemo_qat_flow.py | 2 +- examples/onnx_ptq/README.md | 16 +++--- examples/pruning/README.md | 14 ++--- examples/pruning/cifar_resnet.ipynb | 6 +-- examples/speculative_decoding/README.md | 24 ++++----- examples/speculative_decoding/example.ipynb | 4 +- examples/vllm_serve/Dockerfile | 10 ++-- examples/vlm_ptq/README.md | 16 +++--- examples/windows/Benchmark.md | 4 +- examples/windows/README.md | 22 ++++---- examples/windows/onnx_ptq/genai_llm/README.md | 2 +- modelopt/__init__.py | 2 +- .../torch/quantization/plugins/huggingface.py | 2 +- .../plugins/transformers_trainer.py | 2 +- setup.py | 6 +-- tests/examples/README.md | 6 +-- 57 files changed, 262 insertions(+), 263 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_bug_report.md b/.github/ISSUE_TEMPLATE/1_bug_report.md index c1f0cd02d..992b0f0f9 100644 --- a/.github/ISSUE_TEMPLATE/1_bug_report.md +++ b/.github/ISSUE_TEMPLATE/1_bug_report.md @@ -6,7 +6,7 @@ labels: bug assignees: '' --- -**Before submitting an issue, please make sure it hasn't been already addressed by searching through the [existing and past issues](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues?q=is%3Aissue).** +**Before submitting an issue, please make sure it hasn't been already addressed by searching through the [existing and past issues](https://github.com/NVIDIA/Model-Optimizer/issues?q=is%3Aissue).** ## Describe the bug @@ -30,7 +30,7 @@ If you are unsure about whom to tag, you can 
leave it blank, and we will make su ## System information - + - Container used (if applicable): ? - OS (e.g., Ubuntu 22.04, CentOS 7, Windows 10): ? diff --git a/.github/ISSUE_TEMPLATE/3_question.md b/.github/ISSUE_TEMPLATE/3_question.md index a191bfde2..64886b648 100644 --- a/.github/ISSUE_TEMPLATE/3_question.md +++ b/.github/ISSUE_TEMPLATE/3_question.md @@ -6,7 +6,7 @@ labels: question assignees: '' --- -Make sure you already checked the [examples](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples) and [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/) before submitting an issue. +Make sure you already checked the [examples](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples) and [documentation](https://nvidia.github.io/Model-Optimizer/) before submitting an issue. ## How would you like to use ModelOpt @@ -23,7 +23,7 @@ If you are unsure about whom to tag, you can leave it blank, and we will make su ## System information - + - Container used (if applicable): ? - OS (e.g., Ubuntu 22.04, CentOS 7, Windows 10): ? diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6955b85ca..e3c4dd451 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -17,11 +17,11 @@ ## Before your PR is "*Ready for review*" -- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed. +- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed. - **Is this change backward compatible?**: Yes/No - **Did you write any new necessary tests?**: Yes/No - **Did you add or update any necessary documentation?**: Yes/No -- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No +- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No ## Additional Information diff --git a/CHANGELOG-Windows.rst b/CHANGELOG-Windows.rst index 960cddbfb..cea2aac1d 100644 --- a/CHANGELOG-Windows.rst +++ b/CHANGELOG-Windows.rst @@ -1,13 +1,12 @@ -=================================== -Model Optimizer Changelog (Windows) -=================================== +NVIDIA Model Optimizer Changelog (Windows) +========================================== 0.33 (2025-07-21) ^^^^^^^^^^^^^^^^^ **New Features** -- TensorRT Model Optimizer for Windows now supports `NvTensorRtRtx `_ execution-provider. +- Model Optimizer for Windows now supports `NvTensorRtRtx `_ execution-provider. 0.27 (2025-04-30) @@ -15,8 +14,8 @@ Model Optimizer Changelog (Windows) **New Features** -- New LLM models like DeepSeek etc. are supported with ONNX INT4 AWQ quantization on Windows. Refer `Windows Support Matrix `_ for details about supported features and models. -- TensorRT Model Optimizer for Windows now supports ONNX INT8 and FP8 quantization (W8A8) of SAM2 and Whisper models. Check `example scripts `_ for getting started with quantizing these models. +- New LLM models like DeepSeek etc. are supported with ONNX INT4 AWQ quantization on Windows. Refer `Windows Support Matrix `_ for details about supported features and models. +- Model Optimizer for Windows now supports ONNX INT8 and FP8 quantization (W8A8) of SAM2 and Whisper models. Check `example scripts `_ for getting started with quantizing these models. 
0.19 (2024-11-18) @@ -24,11 +23,11 @@ Model Optimizer Changelog (Windows) **New Features** -- This is the first official release of TensorRT Model Optimizer for Windows +- This is the first official release of Model Optimizer for Windows - **ONNX INT4 Quantization:** :meth:`modelopt.onnx.quantization.quantize_int4 ` now supports ONNX INT4 quantization for DirectML and TensorRT* deployment. See :ref:`Support_Matrix` for details about supported features and models. -- **LLM Quantization with Olive:** Enabled LLM quantization through Olive, streamlining model optimization workflows. Refer `example `_ +- **LLM Quantization with Olive:** Enabled LLM quantization through Olive, streamlining model optimization workflows. Refer `example `_ - **DirectML Deployment Guide:** Added DML deployment guide. Refer :ref:`DirectML_Deployment`. -- **MMLU Benchmark for Accuracy Evaluations:** Introduced `MMLU benchmarking `_ for accuracy evaluation of ONNX models on DirectML (DML). +- **MMLU Benchmark for Accuracy Evaluations:** Introduced `MMLU benchmarking `_ for accuracy evaluation of ONNX models on DirectML (DML). - **Published quantized ONNX models collection:** Published quantized ONNX models at HuggingFace `NVIDIA collections `_. diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b5825d6e1..46338fa3e 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,5 +1,5 @@ -Model Optimizer Changelog (Linux) -================================= +NVIDIA Model Optimizer Changelog (Linux) +======================================== 0.40 (2025-12-11) ^^^^^^^^^^^^^^^^^ @@ -12,7 +12,7 @@ Model Optimizer Changelog (Linux) **New Features** - Add MoE (e.g. Qwen3-30B-A3B, gpt-oss-20b) pruning support for ``num_moe_experts``, ``moe_ffn_hidden_size`` and ``moe_shared_expert_intermediate_size`` parameters in Minitron pruning (``mcore_minitron``). -- Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md `_ for more details. +- Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md `_ for more details. - Add FP8/NVFP4 KV cache quantization support for Megatron Core models. - Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow. - Add support for PyTorch Geometric quantization. @@ -30,7 +30,7 @@ Model Optimizer Changelog (Linux) **Deprecations** -- Deprecated ``modelopt.torch._deploy.utils.get_onnx_bytes`` API. Please use ``modelopt.torch._deploy.utils.get_onnx_bytes_and_metadata`` instead to access the ONNX model bytes with external data. see `examples/onnx_ptq/download_example_onnx.py `_ for example usage. +- Deprecated ``modelopt.torch._deploy.utils.get_onnx_bytes`` API. Please use ``modelopt.torch._deploy.utils.get_onnx_bytes_and_metadata`` instead to access the ONNX model bytes with external data. see `examples/onnx_ptq/download_example_onnx.py `_ for example usage. **New Features** @@ -40,7 +40,7 @@ Model Optimizer Changelog (Linux) - Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` (gated dataset accessed using ``HF_TOKEN`` environment variable) if no dataset is specified. - Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration. - Add support for MCore MoE PTQ/QAT/QAD. 
-- Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md `_ for more details. +- Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md `_ for more details. - Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow. - Add flags ``nodes_to_include`` and ``op_types_to_include`` in AutoCast to force-include nodes in low precision, even if they would otherwise be excluded by other rules. - Add support for ``torch.compile`` and benchmarking in ``examples/diffusers/quantization/diffusion_trt.py``. @@ -51,15 +51,15 @@ Model Optimizer Changelog (Linux) **Documentation** -- Add general guidelines for Minitron pruning and distillation. See `examples/pruning/README.md `_ for more details. -- Added example for exporting QLoRA checkpoint for vLLM deployment. Refer to `examples/llm_qat/README.md `_ for more details +- Add general guidelines for Minitron pruning and distillation. See `examples/pruning/README.md `_ for more details. +- Added example for exporting QLoRA checkpoint for vLLM deployment. Refer to `examples/llm_qat/README.md `_ for more details 0.37 (2025-10-08) ^^^^^^^^^^^^^^^^^ **Deprecations** -- Deprecated ModelOpt's custom docker images. Please use the PyTorch, TensorRT-LLM or TensorRT docker image directly or refer to the `installation guide `_ for more details. +- Deprecated ModelOpt's custom docker images. Please use the PyTorch, TensorRT-LLM or TensorRT docker image directly or refer to the `installation guide `_ for more details. - Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead. - Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. ``engine_dir`` is replaced with ``checkpoint_dir`` in ``examples/llm_ptq`` and ``examples/vlm_ptq``. For performance evaluation, please use ``trtllm-bench`` directly. - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format. @@ -226,8 +226,8 @@ Model Optimizer Changelog (Linux) - Disabled saving modelopt state in unified hf export APIs by default, i.e., added ``save_modelopt_state`` flag in ``export_hf_checkpoint`` API and by default set to False. - Add FP8 and NVFP4 real quantization support with LLM QLoRA example. - The :class:`modelopt.deploy.llm.LLM` now support use the :class:`tensorrt_llm._torch.LLM` backend for the quantized HuggingFace checkpoints. -- Add `NVFP4 PTQ example for DeepSeek-R1 `_. -- Add end-to-end `AutoDeploy example for AutoQuant LLM models `_. +- Add `NVFP4 PTQ example for DeepSeek-R1 `_. +- Add end-to-end `AutoDeploy example for AutoQuant LLM models `_. 0.23 (2025-01-29) ^^^^^^^^^^^^^^^^^ @@ -235,7 +235,7 @@ Model Optimizer Changelog (Linux) **Backward Breaking Changes** - Support TensorRT-LLM to 0.17. Examples (e.g. benchmark task in llm_ptq) may not be fully compatible with TensorRT-LLM 0.15. -- Nvidia TensorRT Model Optimizer has changed its LICENSE from NVIDIA Proprietary (library wheel) and MIT (examples) to Apache 2.0 in this first full OSS release. +- Nvidia Model Optimizer has changed its LICENSE from NVIDIA Proprietary (library wheel) and MIT (examples) to Apache 2.0 in this first full OSS release. - Deprecate Python 3.8, Torch 2.0, and Cuda 11.x support. 
- ONNX Runtime dependency upgraded to 1.20 which no longer supports Python 3.9. - In the Huggingface examples, the ``trust_remote_code`` is by default set to false and require users to explicitly turning it on with ``--trust_remote_code`` flag. @@ -283,7 +283,7 @@ Model Optimizer Changelog (Linux) **Backward Breaking Changes** - Deprecated the summarize task in the ``llm_ptq`` example. -- Deprecated the ``type`` flag in the `huggingface_example.sh `_ +- Deprecated the ``type`` flag in the `huggingface_example.sh `_ - Deprecated Python plugin support in ONNX. - Support TensorRT-LLM 0.13. Examples not compatible with TensorRT-LLM 0.12. - :meth:`mtq.auto_quantize ` API has been updated. The API now @@ -320,7 +320,7 @@ Model Optimizer Changelog (Linux) - New APIs and examples: :mod:`modelopt.torch.prune` for pruning Conv, Linear, and Attention heads for NVIDIA Megatron-core GPT-style models (e.g. Llama 3), PyTorch Computer Vision models, and HuggingFace Bert/GPT-J models. - New API: :mod:`modelopt.torch.distill` for knowledge distillation, along with guides and example. -- New Example: `HF BERT Prune, Distill & Quantize `_ +- New Example: `HF BERT Prune, Distill & Quantize `_ showcasing how to chain pruning, distillation, and quantization to achieve the best performance on a given model. - Added INT8/FP8 DQ-only support for ONNX model. - New API: :mod:`modelopt.torch.speculative` for end-to-end support of Medusa models. @@ -383,13 +383,13 @@ Model Optimizer Changelog (Linux) **Backward Breaking Changes** -- `PTQ examples `_ have been +- `PTQ examples `_ have been upgraded to use TensorRT-LLM 0.10. **New Features** - Adding TensorRT-LLM checkpoint export support for Medusa decoding (official ``MedusaModel`` and Megatron Core ``GPTModel``). -- Enable support for mixtral, recurrentgemma, starcoder, qwen in `PTQ examples `_. +- Enable support for mixtral, recurrentgemma, starcoder, qwen in `PTQ examples `_. - Adding TensorRT-LLM checkpoint export and engine building support for sparse models. - Import scales from TensorRT calibration cache and use them for quantization. - (Experimental) Enable low GPU memory FP8 calibration for the Hugging Face models when the original model size does not fit into the GPU memory. @@ -403,7 +403,7 @@ Model Optimizer Changelog (Linux) **Backward Breaking Changes** - [!!!] The package was renamed from ``ammo`` to ``modelopt``. The new full product - name is *Nvidia TensorRT Model Optimizer*. PLEASE CHANGE ALL YOUR REFERENCES FROM ``ammo`` to + name is *Nvidia Model Optimizer*. PLEASE CHANGE ALL YOUR REFERENCES FROM ``ammo`` to ``modelopt`` including any paths and links! - Default installation ``pip install nvidia-modelopt`` will now only install minimal core dependencies. Following optional dependencies are available depending on the features that are diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 525689764..3ace50ada 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,17 +1,17 @@ -# Contributing to TensorRT Model Optimizer +# Contributing to Model Optimizer -Thanks for your interest in contributing to TensorRT Model Optimizer (ModelOpt)! +Thanks for your interest in contributing to Model Optimizer (ModelOpt)! 
## πŸ› οΈ Setting up your environment -Ensure that TensorRT Model Optimizer (ModelOpt) is installed in editable mode and that all `dev` optional requirements are installed: +Ensure that Model Optimizer (ModelOpt) is installed in editable mode and that all `dev` optional requirements are installed: ```bash pip install -e ".[dev]" ``` If you are working on features that require dependencies like TensorRT-LLM or Megatron-Core, consider using a docker container to simplify the setup process. -Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information. +Visit our [installation docs](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html) for more information. ## 🧹 Code linting and formatting diff --git a/README.md b/README.md index 9f874ada9..b409cbad5 100644 --- a/README.md +++ b/README.md @@ -2,20 +2,20 @@ ![Banner image](docs/source/assets/model-optimizer-banner.png) -# NVIDIA TensorRT Model Optimizer +# NVIDIA Model Optimizer -[![Documentation](https://img.shields.io/badge/Documentation-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-Model-Optimizer) +[![Documentation](https://img.shields.io/badge/Documentation-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/Model-Optimizer) [![version](https://img.shields.io/pypi/v/nvidia-modelopt?label=Release)](https://pypi.org/project/nvidia-modelopt/) [![license](https://img.shields.io/badge/License-Apache%202.0-blue)](./LICENSE) -[Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer) | -[Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146) +[Documentation](https://nvidia.github.io/Model-Optimizer) | +[Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) ______________________________________________________________________ -**NVIDIA TensorRT Model Optimizer** (referred to as **Model Optimizer**, or **ModelOpt**) is a library comprising state-of-the-art model optimization [techniques](#techniques) including quantization, distillation, pruning, speculative decoding and sparsity to accelerate models. +**NVIDIA Model Optimizer** (referred to as **Model Optimizer**, or **ModelOpt**) is a library comprising state-of-the-art model optimization [techniques](#techniques) including quantization, distillation, pruning, speculative decoding and sparsity to accelerate models. **[Input]** Model Optimizer currently supports inputs of a [Hugging Face](https://huggingface.co/), [PyTorch](https://github.com/pytorch/pytorch) or [ONNX](https://github.com/onnx/onnx) model. 
@@ -26,28 +26,28 @@ Model Optimizer is also integrated with [NVIDIA NeMo](https://github.com/NVIDIA- ## Latest News -- [2025/10/07] [Pruning and Distilling LLMs Using NVIDIA TensorRT Model Optimizer](https://developer.nvidia.com/blog/pruning-and-distilling-llms-using-nvidia-tensorrt-model-optimizer/) +- [2025/10/07] [Pruning and Distilling LLMs Using NVIDIA Model Optimizer](https://developer.nvidia.com/blog/pruning-and-distilling-llms-using-nvidia-tensorrt-model-optimizer/) - [2025/09/17] [An Introduction to Speculative Decoding for Reducing Latency in AI Inference](https://developer.nvidia.com/blog/an-introduction-to-speculative-decoding-for-reducing-latency-in-ai-inference/) - [2025/09/11] [How Quantization Aware Training Enables Low-Precision Accuracy Recovery](https://developer.nvidia.com/blog/how-quantization-aware-training-enables-low-precision-accuracy-recovery/) - [2025/08/29] [Fine-Tuning gpt-oss for Accuracy and Performance with Quantization Aware Training](https://developer.nvidia.com/blog/fine-tuning-gpt-oss-for-accuracy-and-performance-with-quantization-aware-training/) - [2025/08/01] [Optimizing LLMs for Performance and Accuracy with Post-Training Quantization](https://developer.nvidia.com/blog/optimizing-llms-for-performance-and-accuracy-with-post-training-quantization/) - [2025/06/24] [Introducing NVFP4 for Efficient and Accurate Low-Precision Inference](https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/) - [2025/05/14] [NVIDIA TensorRT Unlocks FP4 Image Generation for NVIDIA Blackwell GeForce RTX 50 Series GPUs](https://developer.nvidia.com/blog/nvidia-tensorrt-unlocks-fp4-image-generation-for-nvidia-blackwell-geforce-rtx-50-series-gpus/) -- [2025/04/21] [Adobe optimized deployment using TensorRT-Model-Optimizer + TensorRT leading to a 60% reduction in diffusion latency, a 40% reduction in total cost of ownership](https://developer.nvidia.com/blog/optimizing-transformer-based-diffusion-models-for-video-generation-with-nvidia-tensorrt/) +- [2025/04/21] [Adobe optimized deployment using Model-Optimizer + TensorRT leading to a 60% reduction in diffusion latency, a 40% reduction in total cost of ownership](https://developer.nvidia.com/blog/optimizing-transformer-based-diffusion-models-for-video-generation-with-nvidia-tensorrt/) - [2025/04/05] [NVIDIA Accelerates Inference on Meta Llama 4 Scout and Maverick](https://developer.nvidia.com/blog/nvidia-accelerates-inference-on-meta-llama-4-scout-and-maverick/). Check out how to quantize Llama4 for deployment acceleration [here](./examples/llm_ptq/README.md#llama-4) - [2025/03/18] [World's Fastest DeepSeek-R1 Inference with Blackwell FP4 & Increasing Image Generation Efficiency on Blackwell](https://developer.nvidia.com/blog/nvidia-blackwell-delivers-world-record-deepseek-r1-inference-performance/) - [2025/02/25] Model Optimizer quantized NVFP4 models available on Hugging Face for download: [DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), [Llama-3.3-70B-Instruct-FP4](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP4), [Llama-3.1-405B-Instruct-FP4](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP4) - [2025/01/28] Model Optimizer has added support for NVFP4. Check out an example of NVFP4 PTQ [here](./examples/llm_ptq/README.md#model-quantization-and-trt-llm-conversion). - [2025/01/28] Model Optimizer is now open source! 
- [2024/10/23] Model Optimizer quantized FP8 Llama-3.1 Instruct models available on Hugging Face for download: [8B](https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8), [70B](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8), [405B](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8). -- [2024/09/10] [Post-Training Quantization of LLMs with NVIDIA NeMo and TensorRT Model Optimizer](https://developer.nvidia.com/blog/post-training-quantization-of-llms-with-nvidia-nemo-and-nvidia-tensorrt-model-optimizer/). +- [2024/09/10] [Post-Training Quantization of LLMs with NVIDIA NeMo and Model Optimizer](https://developer.nvidia.com/blog/post-training-quantization-of-llms-with-nvidia-nemo-and-nvidia-tensorrt-model-optimizer/).
Previous News -- [2024/08/28] [Boosting Llama 3.1 405B Performance up to 44% with TensorRT Model Optimizer on NVIDIA H200 GPUs](https://developer.nvidia.com/blog/boosting-llama-3-1-405b-performance-by-up-to-44-with-nvidia-tensorrt-model-optimizer-on-nvidia-h200-gpus/) +- [2024/08/28] [Boosting Llama 3.1 405B Performance up to 44% with Model Optimizer on NVIDIA H200 GPUs](https://developer.nvidia.com/blog/boosting-llama-3-1-405b-performance-by-up-to-44-with-nvidia-tensorrt-model-optimizer-on-nvidia-h200-gpus/) - [2024/08/28] [Up to 1.9X Higher Llama 3.1 Performance with Medusa](https://developer.nvidia.com/blog/low-latency-inference-chapter-1-up-to-1-9x-higher-llama-3-1-performance-with-medusa-on-nvidia-hgx-h200-with-nvlink-switch/) -- [2024/08/15] New features in recent releases: [Cache Diffusion](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/diffusers/cache_diffusion), [QLoRA workflow with NVIDIA NeMo](https://docs.nvidia.com/nemo-framework/user-guide/24.09/sft_peft/qlora.html), and more. Check out [our blog](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/) for details. +- [2024/08/15] New features in recent releases: [Cache Diffusion](./examples/diffusers/cache_diffusion), [QLoRA workflow with NVIDIA NeMo](https://docs.nvidia.com/nemo-framework/user-guide/24.09/sft_peft/qlora.html), and more. Check out [our blog](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/) for details. - [2024/06/03] Model Optimizer now has an experimental feature to deploy to vLLM as part of our effort to support popular deployment frameworks. Check out the workflow [here](./examples/llm_ptq/README.md#deploy-fp8-quantized-model-using-vllm) - [2024/05/08] [Announcement: Model Optimizer Now Formally Available to Further Accelerate GenAI Inference Performance](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/) - [2024/03/27] [Model Optimizer supercharges TensorRT-LLM to set MLPerf LLM inference records](https://developer.nvidia.com/blog/nvidia-h200-tensor-core-gpus-and-nvidia-tensorrt-llm-set-mlperf-llm-inference-records/) @@ -69,8 +69,8 @@ To install from source in editable mode with all development dependencies or to ```bash # Clone the Model Optimizer repository -git clone git@github.com:NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer +git clone git@github.com:NVIDIA/Model-Optimizer.git +cd Model-Optimizer pip install -e .[dev] ``` @@ -78,7 +78,7 @@ pip install -e .[dev] You can also directly use the [TensorRT-LLM docker images](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) (e.g., `nvcr.io/nvidia/tensorrt-llm/release:`), which have Model Optimizer pre-installed. Make sure to upgrade Model Optimizer to the latest version using ``pip`` as described above. -Visit our [installation guide](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for +Visit our [installation guide](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html) for more fine-grained control on installed dependencies or for alternative docker images and environment variables to setup. 
## Techniques @@ -87,29 +87,29 @@ more fine-grained control on installed dependencies or for alternative docker im | **Technique** | **Description** | **Examples** | **Docs** | | :------------: | :------------: | :------------: | :------------: | -| Post Training Quantization | Compress model size by 2x-4x, speeding up inference while preserving model quality! | \[[LLMs](./examples/llm_ptq/)\] \[[diffusers](./examples/diffusers/)\] \[[VLMs](./examples/vlm_ptq/)\] \[[onnx](./examples/onnx_ptq/)\] \[[windows](./examples/windows/)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | -| Quantization Aware Training | Refine accuracy even further with a few training steps! | \[[NeMo](./examples/llm_qat#nemo-qatqad-simplified-flow-example)\] \[[Hugging Face](./examples/llm_qat/)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | -| Pruning | Reduce your model size and accelerate inference by removing unnecessary weights! | \[[PyTorch](./examples/pruning/)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/3_pruning.html)\] | -| Distillation | Reduce deployment model size by teaching small models to behave like larger models! | \[[NeMo](./examples/llm_distill#knowledge-distillation-kd-for-nvidia-nemo-models)\] \[[Hugging Face](./examples/llm_distill/)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/4_distillation.html)\] | -| Speculative Decoding | Train draft modules to predict extra tokens during inference! | \[[Megatron](./examples/speculative_decoding#mlm-example)\] \[[Hugging Face](./examples/speculative_decoding/)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/5_speculative_decoding.html)\] | -| Sparsity | Efficiently compress your model by storing only its non-zero parameter values and their locations | \[[PyTorch](./examples/llm_sparsity/)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/6_sparsity.html)\] | +| Post Training Quantization | Compress model size by 2x-4x, speeding up inference while preserving model quality! | \[[LLMs](./examples/llm_ptq/)\] \[[diffusers](./examples/diffusers/)\] \[[VLMs](./examples/vlm_ptq/)\] \[[onnx](./examples/onnx_ptq/)\] \[[windows](./examples/windows/)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | +| Quantization Aware Training | Refine accuracy even further with a few training steps! | \[[NeMo](./examples/llm_qat#nemo-qatqad-simplified-flow-example)\] \[[Hugging Face](./examples/llm_qat/)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | +| Pruning | Reduce your model size and accelerate inference by removing unnecessary weights! | \[[PyTorch](./examples/pruning/)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/3_pruning.html)\] | +| Distillation | Reduce deployment model size by teaching small models to behave like larger models! | \[[NeMo](./examples/llm_distill#knowledge-distillation-kd-for-nvidia-nemo-models)\] \[[Hugging Face](./examples/llm_distill/)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/4_distillation.html)\] | +| Speculative Decoding | Train draft modules to predict extra tokens during inference! 
| \[[Megatron](./examples/speculative_decoding#mlm-example)\] \[[Hugging Face](./examples/speculative_decoding/)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/5_speculative_decoding.html)\] | +| Sparsity | Efficiently compress your model by storing only its non-zero parameter values and their locations | \[[PyTorch](./examples/llm_sparsity/)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/6_sparsity.html)\] | ## Pre-Quantized Checkpoints -- Ready-to-deploy checkpoints \[[πŸ€— Hugging Face - Nvidia TensorRT Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] +- Ready-to-deploy checkpoints \[[πŸ€— Hugging Face - Nvidia Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] - Deployable on [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [vLLM](https://github.com/vllm-project/vllm) and [SGLang](https://github.com/sgl-project/sglang) - More models coming soon! ## Resources -- πŸ“… [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146) -- πŸ“– [Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer) +- πŸ“… [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) +- πŸ“– [Documentation](https://nvidia.github.io/Model-Optimizer) - 🎯 [Benchmarks](./examples/benchmark.md) -- πŸ’‘ [Release Notes](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html) -- πŸ› [File a bug](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=1_bug_report.md) -- ✨ [File a Feature Request](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=2_feature_request.md) +- πŸ’‘ [Release Notes](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html) +- πŸ› [File a bug](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=1_bug_report.md) +- ✨ [File a Feature Request](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=2_feature_request.md) ## Model Support Matrix @@ -132,6 +132,6 @@ Please read our [Contributing](./CONTRIBUTING.md) guidelines for details on how ### Top Contributors -[![Contributors](https://contrib.rocks/image?repo=NVIDIA/TensorRT-Model-Optimizer)](https://github.com/NVIDIA/TensorRT-Model-Optimizer/graphs/contributors) +[![Contributors](https://contrib.rocks/image?repo=NVIDIA/Model-Optimizer)](https://github.com/NVIDIA/Model-Optimizer/graphs/contributors) Happy optimizing! diff --git a/docs/source/conf.py b/docs/source/conf.py index 36046172e..dbf66c18d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -46,7 +46,7 @@ # -- Project information ----------------------------------------------------- -project = "TensorRT Model Optimizer" # pylint: disable=C0103 +project = "Model Optimizer" # pylint: disable=C0103 copyright = "2023-2025, NVIDIA Corporation" # pylint: disable=C0103 author = "NVIDIA Corporation" # pylint: disable=C0103 version = __version__ diff --git a/docs/source/deployment/2_directml.rst b/docs/source/deployment/2_directml.rst index 314746028..90a4a31a9 100644 --- a/docs/source/deployment/2_directml.rst +++ b/docs/source/deployment/2_directml.rst @@ -5,7 +5,7 @@ DirectML =================== -Once an ONNX FP16 model is quantized using TensorRT Model Optimizer on Windows, the resulting quantized ONNX model can be deployed on the DirectML (DML) backend via the `ONNX Runtime GenAI `_ or `ONNX Runtime `_. 
+Once an ONNX FP16 model is quantized using Model Optimizer on Windows, the resulting quantized ONNX model can be deployed on the DirectML (DML) backend via the `ONNX Runtime GenAI `_ or `ONNX Runtime `_. .. note:: Currently, DirectML backend doesn't support 8-bit precision. So, 8-bit quantized models should be deployed on other backends like ORT-CUDA etc. However, DML path does support INT4 quantized models. diff --git a/docs/source/deployment/3_unified_hf.rst b/docs/source/deployment/3_unified_hf.rst index 6671c98a6..ee8c5bc9b 100644 --- a/docs/source/deployment/3_unified_hf.rst +++ b/docs/source/deployment/3_unified_hf.rst @@ -6,7 +6,7 @@ We support exporting modelopt-optimized Huggingface models and Megatron Core mod The workflow is as follows: -#. Load the Huggingface models or Megatron Core models, `quantize with modelopt `_ , and export to the unified checkpoint format, where the layer structures and tensor names are aligned with the original checkpoint. +#. Load the Huggingface models or Megatron Core models, `quantize with modelopt `_ , and export to the unified checkpoint format, where the layer structures and tensor names are aligned with the original checkpoint. #. Load the unified checkpoint in the supported inference framework for accelerated inference. diff --git a/docs/source/examples/0_all_examples.rst b/docs/source/examples/0_all_examples.rst index a51e316bc..690fc9869 100644 --- a/docs/source/examples/0_all_examples.rst +++ b/docs/source/examples/0_all_examples.rst @@ -3,4 +3,4 @@ All GitHub Examples All examples related to optimization techniques like Quantization (PTQ, QAT), Sparsity, Distillation, Pruning, TensorRT-LLM deployment and more can be accessed from the ModelOpt GitHub repository at -`github.com/NVIDIA/TensorRT-Model-Optimizer `_. +`github.com/NVIDIA/Model-Optimizer `_. diff --git a/docs/source/getting_started/1_overview.rst b/docs/source/getting_started/1_overview.rst index e8ae04a4e..698819ba5 100644 --- a/docs/source/getting_started/1_overview.rst +++ b/docs/source/getting_started/1_overview.rst @@ -2,18 +2,18 @@ Overview ======== -**NVIDIA TensorRT Model Optimizer** +**NVIDIA Model Optimizer** =================================== Minimizing inference costs presents a significant challenge as generative AI models continue to grow in complexity and size. -The `NVIDIA TensorRT Model Optimizer `_ (referred to as Model Optimizer, or ModelOpt) +The `NVIDIA Model Optimizer `_ (referred to as Model Optimizer, or ModelOpt) is a library comprising state-of-the-art model optimization techniques including quantization and sparsity to compress model. It accepts a torch or ONNX model as input and provides Python APIs for users to easily stack different model optimization techniques to produce optimized & quantized checkpoints. Seamlessly integrated within the NVIDIA AI software ecosystem, the quantized checkpoint generated from Model Optimizer is ready for deployment in downstream inference frameworks like `TensorRT-LLM `_ or `TensorRT `_ (Linux). ModelOpt is integrated with `NVIDIA NeMo `_ and `Megatron-LM `_ for training-in-the-loop optimization techniques. For enterprise users, the 8-bit quantization with Stable Diffusion is also available on `NVIDIA NIM `_. -For Windows users, the `TensorRT Model Optimizer for Windows `_ (ModelOpt-Windows) delivers model compression techniques, including quantization, on Windows RTX PC systems. 
ModelOpt-Windows is optimized for efficient quantization, featuring local GPU calibration, reduced system and video memory consumption, and swift processing times. It integrates seamlessly with the Windows ecosystem, with optimized ONNX models as output for `Microsoft DirectML `_ backends. Furthermore, ModelOpt-Windows supports SDKs such as `Microsoft Olive `_ and `ONNX Runtime `_, enabling the deployment of quantized models across various independent hardware vendors through the DirectML path. +For Windows users, the `Model Optimizer for Windows `_ (ModelOpt-Windows) delivers model compression techniques, including quantization, on Windows RTX PC systems. ModelOpt-Windows is optimized for efficient quantization, featuring local GPU calibration, reduced system and video memory consumption, and swift processing times. It integrates seamlessly with the Windows ecosystem, with optimized ONNX models as output for `Microsoft DirectML `_ backends. Furthermore, ModelOpt-Windows supports SDKs such as `Microsoft Olive `_ and `ONNX Runtime `_, enabling the deployment of quantized models across various independent hardware vendors through the DirectML path. -TensorRT Model Optimizer for both Linux and Windows are available for free for all developers on `NVIDIA PyPI `_. Visit the `TensorRT Model Optimizer GitHub repository `_ for end-to-end +Model Optimizer for both Linux and Windows are available for free for all developers on `NVIDIA PyPI `_. Visit the `Model Optimizer GitHub repository `_ for end-to-end example scripts and recipes optimized for NVIDIA GPUs. Techniques diff --git a/docs/source/getting_started/windows/_installation_for_Windows.rst b/docs/source/getting_started/windows/_installation_for_Windows.rst index bb729c46e..a386fd30f 100644 --- a/docs/source/getting_started/windows/_installation_for_Windows.rst +++ b/docs/source/getting_started/windows/_installation_for_Windows.rst @@ -6,7 +6,7 @@ Installation for Windows **System Requirements** -The following system requirements are necessary to install and use TensorRT Model Optimizer - Windows: +The following system requirements are necessary to install and use Model Optimizer - Windows: +-------------------------+-----------------------------+ | OS | Windows | @@ -28,7 +28,7 @@ The following system requirements are necessary to install and use TensorRT Mode - Make sure to use GPU-compatible driver and other dependencies (e.g. torch etc.). For instance, support for Blackwell GPU might be present in Nvidia 570+ driver, and CUDA-12.8. - We currently support *Single-GPU* configuration. -The TensorRT Model Optimizer - Windows can be used in following ways: +The Model Optimizer - Windows can be used in following ways: .. toctree:: :glob: diff --git a/docs/source/getting_started/windows/_installation_standalone.rst b/docs/source/getting_started/windows/_installation_standalone.rst index e5b8e4c9d..47f36050c 100644 --- a/docs/source/getting_started/windows/_installation_standalone.rst +++ b/docs/source/getting_started/windows/_installation_standalone.rst @@ -4,7 +4,7 @@ Install ModelOpt-Windows as a Standalone Toolkit ================================================ -The TensorRT Model Optimizer - Windows (ModelOpt-Windows) can be installed as a standalone toolkit for quantizing ONNX models. Below are the setup steps: +The Model Optimizer - Windows (ModelOpt-Windows) can be installed as a standalone toolkit for quantizing ONNX models. Below are the setup steps: **1. 
Setup Prerequisites** diff --git a/docs/source/getting_started/windows/_installation_with_olive.rst b/docs/source/getting_started/windows/_installation_with_olive.rst index 977a29f16..a05155278 100644 --- a/docs/source/getting_started/windows/_installation_with_olive.rst +++ b/docs/source/getting_started/windows/_installation_with_olive.rst @@ -26,16 +26,16 @@ Setup Steps for Olive with ModelOpt-Windows - Above onnxruntime and onnxruntime-genai packages enable Olive workflow with DirectML Execution-Provider (EP). To use other EPs, install corresponding packages. - - Additionally, ensure that dependencies for TensorRT Model Optimizer - Windows are met as mentioned in the :ref:`Install-Page-Standalone-Windows`. + - Additionally, ensure that dependencies for Model Optimizer - Windows are met as mentioned in the :ref:`Install-Page-Standalone-Windows`. -**2. Configure Olive for TensorRT Model Optimizer – Windows** +**2. Configure Olive for Model Optimizer – Windows** - - **New Olive Pass:** Olive introduces a new pass, ``NVModelOptQuantization`` (or β€œnvmo”), specifically designed for model quantization using TensorRT Model Optimizer – Windows. - - **Add to Configuration:** To apply quantization to your target model, include this pass in the Olive configuration file. [Refer `phi3 `_ Olive example]. + - **New Olive Pass:** Olive introduces a new pass, ``NVModelOptQuantization`` (or β€œnvmo”), specifically designed for model quantization using Model Optimizer – Windows. + - **Add to Configuration:** To apply quantization to your target model, include this pass in the Olive configuration file. [Refer `phi3 `_ Olive example]. **3. Setup Other Passes in Olive Configuration** - - **Add Other Passes:** Add additional passes to the Olive configuration file as needed for the desired Olive workflow of your input model. [Refer `phi3 `_ Olive example] + - **Add Other Passes:** Add additional passes to the Olive configuration file as needed for the desired Olive workflow of your input model. [Refer `phi3 `_ Olive example] **4. Install other dependencies** @@ -61,5 +61,5 @@ Setup Steps for Olive with ModelOpt-Windows **Note**: -#. Currently, the TensorRT-Model Optimizer - Windows only supports Onnx Runtime GenAI based LLM models in the Olive workflow. -#. To try out different LLMs and EPs in the Olive workflow of ModelOpt-Windows, refer the details provided in `phi3 `_ Olive example. +#. Currently, the Model Optimizer - Windows only supports Onnx Runtime GenAI based LLM models in the Olive workflow. +#. To try out different LLMs and EPs in the Olive workflow of ModelOpt-Windows, refer the details provided in `phi3 `_ Olive example. diff --git a/docs/source/guides/0_support_matrix.rst b/docs/source/guides/0_support_matrix.rst index 308d82d99..69e860e4b 100644 --- a/docs/source/guides/0_support_matrix.rst +++ b/docs/source/guides/0_support_matrix.rst @@ -92,7 +92,7 @@ Model Support Matrix .. tab:: Linux - Please checkout the model support matrix `here `_. + Please checkout the model support matrix `here `_. .. tab:: Windows @@ -179,4 +179,4 @@ Model Support Matrix .. note:: - ``ONNX INT8 Max`` means INT8 (W8A8) quantization of ONNX model using Max calibration. Similar holds true for the term ``ONNX FP8 Max``. - The LLMs in above table are `GenAI `_ built LLMs unless specified otherwise. - - Check `examples `_ for specific instructions and scripts. + - Check `examples `_ for specific instructions and scripts. 
diff --git a/docs/source/guides/1_quantization.rst b/docs/source/guides/1_quantization.rst index 417a1cd2c..a838bfb10 100644 --- a/docs/source/guides/1_quantization.rst +++ b/docs/source/guides/1_quantization.rst @@ -8,7 +8,7 @@ ModelOpt is based on simulated quantization in the original precision to simulat optimize for the best trade-off between the accuracy of the model and different low-precision formats. To achieve actual speedups and memory savings, the model with simulated quantization can be exported to deployment frameworks, like TensorRT or TensorRT-LLM. Please refer to the -`TensorRT-Model-Optimizer GitHub repository `_ +`Model-Optimizer GitHub repository `_ for more details and examples. Below, you can find the documentation for the quantization toolkit in ModelOpt: diff --git a/docs/source/guides/3_pruning.rst b/docs/source/guides/3_pruning.rst index 786c286da..9609f8cc2 100644 --- a/docs/source/guides/3_pruning.rst +++ b/docs/source/guides/3_pruning.rst @@ -5,7 +5,7 @@ Pruning .. tip:: Checkout `Qwen 3 NeMo Minitron Pruning & Distillation `_ and - `ResNet20 on CIFAR-10 Notebook `_ + `ResNet20 on CIFAR-10 Notebook `_ for an end-to-end example of pruning. ModelOpt provides three main pruning methods (aka ``mode``) - Minitron, FastNAS and GradNAS - via a unified API @@ -61,7 +61,7 @@ Prerequisites #. Please see the API reference of :meth:`mtp.prune() ` for more details. Below we show an example using :class:`"fastnas" `. -For Minitron pruning, please refer to the `example snippet `_ in the pruning readme. +For Minitron pruning, please refer to the `example snippet `_ in the pruning readme. Perform pruning --------------- diff --git a/docs/source/guides/7_nas.rst b/docs/source/guides/7_nas.rst index 8cd22d214..cdad2933c 100644 --- a/docs/source/guides/7_nas.rst +++ b/docs/source/guides/7_nas.rst @@ -643,7 +643,7 @@ The difference between NAS and pruning is summarized below. [Advanced] Adding a new NAS/Prune Algorithm =========================================== -* Please refer to this `template `_ +* Please refer to this `template `_ for adding a new NAS algorithm. -* Please refer to `mcore_minitron.py `_ - for an actual example of adding Minitron Pruning algorithm. \ No newline at end of file +* Please refer to `mcore_minitron.py `_ + for an actual example of adding Minitron Pruning algorithm. diff --git a/docs/source/guides/_customized_model_quantization.rst b/docs/source/guides/_customized_model_quantization.rst index 78525c684..c75c63739 100644 --- a/docs/source/guides/_customized_model_quantization.rst +++ b/docs/source/guides/_customized_model_quantization.rst @@ -15,7 +15,7 @@ As ModelOpt cannot detect these linear ops out-of-the-box, a HugggingFace plugin #. Define a customized ``_QuantDbrxExpertGLU`` as a ``DynamicModule`` with the same ``forward`` signature. #. Rewrite the linear ops (w1, v1 and v2) as a standard ``nn.Linear`` op, and re-implement the ``forward`` method. #. Register the new dynamic ``_QuantDbrxExperts`` to replace the ``DbrxExperts`` from the modeling_dbrx.py in the ``transformers`` library -#. Try quantize the DBRX model after the plugin is implemented, feel free to follow the `llm_ptq example `_. +#. Try quantize the DBRX model after the plugin is implemented, feel free to follow the `llm_ptq example `_. #. TensorRT-LLM is open-sourced. 
If this customized model is not supported by TensorRT-LLM yet, please modify :meth:`export_tensorrt_llm_checkpoint ` or :meth:`export_hf_checkpoint ` to export the quantized model for deployment with a customized TensorRT-LLM modeling implementation. Feel free to :doc:`contact us <../support/1_contact>` if further support is needed. The following code snippet is excerpted from ``modelopt/torch/quantization/plugins/huggingface.py`` diff --git a/docs/source/guides/windows_guides/_ONNX_PTQ_guide.rst b/docs/source/guides/windows_guides/_ONNX_PTQ_guide.rst index 15bf30af4..9e60611c0 100644 --- a/docs/source/guides/windows_guides/_ONNX_PTQ_guide.rst +++ b/docs/source/guides/windows_guides/_ONNX_PTQ_guide.rst @@ -20,7 +20,7 @@ This transformation reduces model size and allows deployment on systems with lim ModelOpt-Windows Quantization ----------------------------- -The TensorRT Model Optimizer - Windows is designed to create optimized ONNX models for DirectML and TensorRT* backends. +The Model Optimizer - Windows is designed to create optimized ONNX models for DirectML and TensorRT* backends. **Supported Techniques**: @@ -69,7 +69,7 @@ As an example, preparing calibration data for INT4 AWQ quantization of LLMs may 1. **Generate Token Encodings**: Use a dataset like *cnn-dailymail* or *pile* with the model's tokenizer to generate token encodings and related data from the representative dataset 2. **Format for Model Input**: Convert encodings into model-compatible formats. -Please refer the `example scripts `_ for details about preparing calibration-data of various supported ONNX models. +Please refer the `example scripts `_ for details about preparing calibration-data of various supported ONNX models. **Call Quantization API** @@ -155,4 +155,4 @@ To save a quantized ONNX model with external data, use the following code: Deploy Quantized ONNX Model --------------------------- -Inference of the quantized models can be done using tools like `GenAI `_, `OnnxRunTime (ORT) `_. These APIs can do inference on backends like DML. For details about DirectML deployment of quantized models, see :ref:`DirectML_Deployment`. Also, refer `example scripts `_ for any possible model-specific inference guidance or script (if any). +Inference of the quantized models can be done using tools like `GenAI `_, `OnnxRunTime (ORT) `_. These APIs can do inference on backends like DML. For details about DirectML deployment of quantized models, see :ref:`DirectML_Deployment`. Also, refer `example scripts `_ for any possible model-specific inference guidance or script (if any). diff --git a/docs/source/index.rst b/docs/source/index.rst index 9f55e0f89..5f08de7bf 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -7,14 +7,14 @@ Welcome to Model Optimizer (ModelOpt) documentation! :caption: Getting Started getting_started/[0-9]* - Quick Start: PTQ - PyTorch - Quick Start: PTQ - ONNX - Quick Start: PTQ - Windows - Quick Start: QAT - Quick Start: Pruning - Quick Start: Distillation - Quick Start: Speculative Decoding - Quick Start: Sparsity + Quick Start: PTQ - PyTorch + Quick Start: PTQ - ONNX + Quick Start: PTQ - Windows + Quick Start: QAT + Quick Start: Pruning + Quick Start: Distillation + Quick Start: Speculative Decoding + Quick Start: Sparsity .. 
toctree:: :glob: diff --git a/docs/source/support/1_contact.rst b/docs/source/support/1_contact.rst index ae602443e..29eb404e3 100644 --- a/docs/source/support/1_contact.rst +++ b/docs/source/support/1_contact.rst @@ -3,4 +3,4 @@ Contact us ========== -Contact us by submitting issues on `GitHub `_. +Contact us by submitting issues on `GitHub `_. diff --git a/examples/benchmark.md b/examples/benchmark.md index 22671fb82..104a2ed22 100644 --- a/examples/benchmark.md +++ b/examples/benchmark.md @@ -1,6 +1,6 @@ -# TensorRT Model Optimizer Benchmark Reference +# Model Optimizer Benchmark Reference -This document summarizes performance and accuracy measurements of [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) for a few popular models. +This document summarizes performance and accuracy measurements of [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) for a few popular models. The benchmark in the following tables is provided as reference points and **should not be considered as the peak performance** that can be delivered by Model Optimizer. All performance numbers are tested with [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) or [TensorRT](https://developer.nvidia.com/tensorrt-getting-started). diff --git a/examples/cnn_qat/README.md b/examples/cnn_qat/README.md index 6afbd7699..c421ce868 100644 --- a/examples/cnn_qat/README.md +++ b/examples/cnn_qat/README.md @@ -2,7 +2,7 @@ Quantization-Aware Training (QAT) with NVIDIA ModelOpt injects simulated quantization noise during training to recover accuracy lost by Post-Training Quantization (PTQ). A CNN model quantized via `mtq.quantize()` can be fine-tuned using your existing training loop. During QAT, the quantizer scales are frozen while the model weights adapt. -Learn more in the [ModelOpt QAT guide](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_pytorch_quantization.html#quantization-aware-training-qat). +Learn more in the [ModelOpt QAT guide](https://nvidia.github.io/Model-Optimizer/guides/_pytorch_quantization.html#quantization-aware-training-qat). > **_NOTE:_** This example uses a TorchVision ResNet-50 on an ImageNet-style dataset, but you can extend the same steps to any CNN and computer-vision dataset. @@ -66,7 +66,7 @@ model.to(device) See the full script [torchvision_qat.py](./torchvision_qat.py) for all boilerplate (argument parsing, DDP setup, logging, etc.). -> **_NOTE:_** The example above uses [mto.save](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/2_save_load.html#saving-modelopt-models) and [mto.restore](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/2_save_load.html#restoring-modelopt-models) for saving and restoring ModelOpt modified models. These functions handle the model weights as well as the quantizer states. Please see [saving & restoring](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/2_save_load.html) to learn more. +> **_NOTE:_** The example above uses [mto.save](https://nvidia.github.io/Model-Optimizer/guides/2_save_load.html#saving-modelopt-models) and [mto.restore](https://nvidia.github.io/Model-Optimizer/guides/2_save_load.html#restoring-modelopt-models) for saving and restoring ModelOpt modified models. These functions handle the model weights as well as the quantizer states. Please see [saving & restoring](https://nvidia.github.io/Model-Optimizer/guides/2_save_load.html) to learn more. 
### End-to-end QAT Example diff --git a/examples/diffusers/README.md b/examples/diffusers/README.md index 0aba1118f..5c514ca4a 100644 --- a/examples/diffusers/README.md +++ b/examples/diffusers/README.md @@ -11,12 +11,12 @@ Cache Diffusion is a technique that reuses cached outputs from previous diffusio | **Section** | **Description** | **Link** | **Docs** | | :------------: | :------------: | :------------: | :------------: | | Pre-Requisites | Required & optional packages to use this technique | \[[Link](#pre-requisites)\] | | -| Getting Started | Learn how to optimize your models using quantization/cache diffusion to reduce precision and improve inference efficiency | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | -| Support Matrix | View the support matrix to see quantization/cahce diffusion compatibility and feature availability across different models | \[[Link](#support-matrix)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | +| Getting Started | Learn how to optimize your models using quantization/cache diffusion to reduce precision and improve inference efficiency | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | +| Support Matrix | View the support matrix to see quantization/cahce diffusion compatibility and feature availability across different models | \[[Link](#support-matrix)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | | Cache Diffusion | Caching technique to accelerate inference without compromising quality | \[[Link](#cache-diffusion)\] | | -| Post Training Quantization (PTQ) | Example scripts on how to run PTQ on diffusion models | \[[Link](#post-training-quantization-ptq)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | -| Quantization Aware Training (QAT) | Example scripts on how to run QAT on diffusion models | \[[Link](#quantization-aware-training-qat)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | -| Quantization Aware Distillation (QAD) | Example scripts on how to run QAD on diffusion models | \[[Link](#quantization-aware-distillation-qad)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | +| Post Training Quantization (PTQ) | Example scripts on how to run PTQ on diffusion models | \[[Link](#post-training-quantization-ptq)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | +| Quantization Aware Training (QAT) | Example scripts on how to run QAT on diffusion models | \[[Link](#quantization-aware-training-qat)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | +| Quantization Aware Distillation (QAD) | Example scripts on how to run QAD on diffusion models | \[[Link](#quantization-aware-distillation-qad)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | | Build and Run with TensorRT | How to build and run your quantized model with TensorRT | \[[Link](#build-and-run-with-tensorrt-compiler-framework)\] | | | LoRA | Fuse your LoRA weights prior to quantization | \[[Link](#lora)\] | | | Evaluate Accuracy | Evaluate your model's accuracy! 
| \[[Link](#evaluate-accuracy)\] | | @@ -29,7 +29,7 @@ Cache Diffusion is a technique that reuses cached outputs from previous diffusio ### Docker -Please use the TensorRT docker image (e.g., `nvcr.io/nvidia/tensorrt:25.08-py3`) or visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information. +Please use the TensorRT docker image (e.g., `nvcr.io/nvidia/tensorrt:25.08-py3`) or visit our [installation docs](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html) for more information. Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install example-specific dependencies. @@ -45,13 +45,13 @@ Each subsection (cache_diffusion, quantization, etc.) have their own `requiremen You can find the latest TensorRT [here](https://developer.nvidia.com/tensorrt/download). -Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information. +Visit our [installation docs](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html) for more information. ## Getting Started ### Quantization -With the simple API below, you can very easily use Model Optimizer to quantize your model. Model Optimizer achieves this by converting the precision of your model to the desired precision, and then using a small dataset (typically 128-512 samples) to [calibrate](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_basic_quantization.html) the quantization scaling factors. +With the simple API below, you can very easily use Model Optimizer to quantize your model. Model Optimizer achieves this by converting the precision of your model to the desired precision, and then using a small dataset (typically 128-512 samples) to [calibrate](https://nvidia.github.io/Model-Optimizer/guides/_basic_quantization.html) the quantization scaling factors. ```python import modelopt.torch.quantization as mtq @@ -168,7 +168,7 @@ Once the model is loaded in its quantized state through ModelOPT, you can procee Distillation is a powerful approach where a high-precision model (the teacher) guides the training of a quantized model (the student). ModelOPT simplifies the process of combining distillation with QAT by handling most of the complexity for you. -For more details about distillation, please refer to this [link](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/4_distillation.html). +For more details about distillation, please refer to this [link](https://nvidia.github.io/Model-Optimizer/guides/4_distillation.html). 
```diff import modelopt.torch.opt as mto @@ -543,9 +543,9 @@ Example metrics obtained with 30 sampling steps on a set of 1K prompts (values w ## Resources -- πŸ“… [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146) -- πŸ“– [Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer) +- πŸ“… [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) +- πŸ“– [Documentation](https://nvidia.github.io/Model-Optimizer) - 🎯 [Benchmarks](../benchmark.md) -- πŸ’‘ [Release Notes](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html) -- πŸ› [File a bug](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=1_bug_report.md) -- ✨ [File a Feature Request](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=2_feature_request.md) +- πŸ’‘ [Release Notes](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html) +- πŸ› [File a bug](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=1_bug_report.md) +- ✨ [File a Feature Request](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=2_feature_request.md) diff --git a/examples/diffusers/quantization/quantize.py b/examples/diffusers/quantization/quantize.py index e9cd22b14..82abccc1a 100644 --- a/examples/diffusers/quantization/quantize.py +++ b/examples/diffusers/quantization/quantize.py @@ -36,7 +36,7 @@ # This is a workaround for making the onnx export of models that use the torch RMSNorm work. We will # need to move on to use dynamo based onnx export to properly fix the problem. The issue has been hit -# by both external users https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/262, and our +# by both external users https://github.com/NVIDIA/Model-Optimizer/issues/262, and our # internal users from MLPerf Inference. # if __name__ == "__main__": diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md index 9cb3efd6a..62f1435f9 100644 --- a/examples/gpt-oss/README.md +++ b/examples/gpt-oss/README.md @@ -27,7 +27,7 @@ pip install -r requirements.txt In Quantization Aware Training, the forward computations are performed with 'fake quantized' values and the backward computations are performed with high precision datatype. In 'fake quantization' the numerical equivalent of the quantized value is represented using a high precision datatype such as BF16. Hence, QAT can be integrated to standard training pipeline such as regular BF16 mixed precision training. -During QAT, the model learns to recover the accuracy after quantization. To perform QAT, quantize your model first using ModelOpt's [`mtq.quantize`](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.quantize) API. Then you can train this quantized model with your existing training pipeline. +During QAT, the model learns to recover the accuracy after quantization. To perform QAT, quantize your model first using ModelOpt's [`mtq.quantize`](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.quantize) API. Then you can train this quantized model with your existing training pipeline. Here is a code example: @@ -94,7 +94,7 @@ accelerate launch --config_file configs/zero3.yaml sft.py \ The final QAT checkpoint is in fake-quantized form. Low memory footprint and speedup comes after [deployment](#deployment) to accelerated runtimes. 
-Note: For restoring the model checkpoint for Pytorch native evaluation, see [ModelOpt Restore using Huggingface APIs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/2_save_load.html#modelopt-save-restore-using-huggingface-checkpointing-apis). +Note: For restoring the model checkpoint for Pytorch native evaluation, see [ModelOpt Restore using Huggingface APIs](https://nvidia.github.io/Model-Optimizer/guides/2_save_load.html#modelopt-save-restore-using-huggingface-checkpointing-apis). ## Deployment @@ -179,8 +179,8 @@ You can deploy this real quantized MXFP4 checkpoint just like the original GPT-O ### Easy QAT from ModelOpt using LLaMA-Factory -ModelOpt provides easy end to end QAT via [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), an open-source repository for LLM/VLM finetuning. Please refer to [LLaMa-Factory QAT example](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_qat/llama_factory) for performing QAT on your favorite models. +ModelOpt provides easy end to end QAT via [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), an open-source repository for LLM/VLM finetuning. Please refer to [LLaMa-Factory QAT example](../llm_qat/llama_factory) for performing QAT on your favorite models. ### Deployment of ModelOpt QAT/PTQ models beyond GPT-OSS -ModelOpt supports exporting a wide variety of models after QAT/PTQ to TensorRT-LLM, vLLM, SGLang etc. Please refer to [llm_ptq](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq). +ModelOpt supports exporting a wide variety of models after QAT/PTQ to TensorRT-LLM, vLLM, SGLang etc. Please refer to [llm_ptq](../llm_ptq). diff --git a/examples/llm_autodeploy/README.md b/examples/llm_autodeploy/README.md index cd910a282..c21f8c203 100644 --- a/examples/llm_autodeploy/README.md +++ b/examples/llm_autodeploy/README.md @@ -2,7 +2,7 @@ This guide demonstrates how to deploy mixed-precision models using ModelOpt's AutoQuant and TRT-LLM's AutoDeploy. -[ModelOpt's AutoQuant](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) is a post-training quantization (PTQ) algorithm that optimizes model quantization by selecting the best quantization format for each layer while adhering to user-defined compression constraints. This approach allows users to balance model accuracy and performance effectively. +[ModelOpt's AutoQuant](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) is a post-training quantization (PTQ) algorithm that optimizes model quantization by selecting the best quantization format for each layer while adhering to user-defined compression constraints. This approach allows users to balance model accuracy and performance effectively. [TRT-LLM's AutoDeploy](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/auto_deploy) is designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models like those from Hugging Face, to optimized inference environments with TRT-LLM. It automates graph transformations to integrate inference optimizations such as tensor parallelism, KV-caching and quantization. AutoDeploy supports optimized in-framework deployment, minimizing the amount of manual modification needed. 
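Relatedly, the note above on restoring a QAT checkpoint for PyTorch-native evaluation can be sketched as follows. This is only a rough illustration of the Hugging Face checkpointing route described in the linked save/restore guide; the checkpoint directory and prompt are placeholders.

```python
import modelopt.torch.opt as mto
from transformers import AutoModelForCausalLM, AutoTokenizer

# Register ModelOpt's Hugging Face checkpointing hooks first; afterwards from_pretrained()
# restores the quantizer state that was saved alongside the QAT weights.
mto.enable_huggingface_checkpointing()

ckpt_dir = "path/to/qat_checkpoint"  # placeholder: your trainer.save_model() output directory
model = AutoModelForCausalLM.from_pretrained(ckpt_dir, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)

# The restored model is still fake-quantized, so it can be evaluated natively in PyTorch.
inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=8)[0]))
```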
diff --git a/examples/llm_distill/README.md b/examples/llm_distill/README.md index 430303b65..0571d184a 100644 --- a/examples/llm_distill/README.md +++ b/examples/llm_distill/README.md @@ -11,11 +11,11 @@ This section focuses on demonstrating how to apply Model Optimizer to perform kn | **Section** | **Description** | **Link** | **Docs** | | :------------: | :------------: | :------------: | :------------: | | Pre-Requisites | Required & optional packages to use this technique | \[[Link](#pre-requisites)\] | | -| Getting Started | Learn how to optimize your models using distillation to produce more intellegant smaller models | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/4_distillation.html)\] | +| Getting Started | Learn how to optimize your models using distillation to produce more intelligent smaller models | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/4_distillation.html)\] | | Support Matrix | View the support matrix to see compatibility and feature availability across different models | \[[Link](#support-matrix)\] | | | Distillation with Megatron-LM | Learn how to distill your models with Megatron-LM Framework | \[[Link](#knowledge-distillation-kd-in-nvidia-megatron-lm-framework)\] | | -| Distillation with NeMo | Learn how to distill your models with NeMo Framework | \[[Link](#knowledge-distillation-kd-in-nvidia-nemo-framework)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/4_distillation.html)\] | -| Distillation with Huggingface | Learn how to distill your models with Hugging Face | \[[Link](#knowledge-distillation-kd-for-huggingface-models)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/4_distillation.html)\] | +| Distillation with NeMo | Learn how to distill your models with NeMo Framework | \[[Link](#knowledge-distillation-kd-in-nvidia-nemo-framework)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/4_distillation.html)\] | +| Distillation with Huggingface | Learn how to distill your models with Hugging Face | \[[Link](#knowledge-distillation-kd-for-huggingface-models)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/4_distillation.html)\] | | Resources | Extra links to relevant resources | \[[Link](#resources)\] | | | NeMo Prune + Distill Simplified Flow | Example script demonstrating end-to-end pruning plus distillation in NeMo | \[[Link](../nemo_run/prune_distill/README.md)\] | | @@ -27,7 +27,7 @@ This section focuses on demonstrating how to apply Model Optimizer to perform kn For Hugging Face models, please use the PyTorch docker image (e.g., `nvcr.io/nvidia/pytorch:25.06-py3`). For NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.09`) which has all the dependencies installed. -Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information. +Visit our [installation docs](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html) for more information. Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install example-specific dependencies. @@ -74,7 +74,7 @@ distillation_model = mtd.convert(student_model, mode=[("kd_loss", distillation_c The `teacher_model` can be either a `nn.Module`, a callable which returns an `nn.Module`, or a tuple of `(model_cls, args, kwargs)`. The `criterion` is the distillation loss used between student and teacher tensors.
The `loss_balancer` determines how the original and distillation losses are combined (if needed). -See [Distillation](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/4_distillation.html) for more info. +See [Distillation](https://nvidia.github.io/Model-Optimizer/guides/4_distillation.html) for more info. ### Distill during training @@ -177,9 +177,9 @@ accelerate launch --config-file ./accelerate_config/fsdp2.yaml \ ## Resources -- πŸ“… [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146) -- πŸ“– [Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer) +- πŸ“… [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) +- πŸ“– [Documentation](https://nvidia.github.io/Model-Optimizer) - 🎯 [Benchmarks](../benchmark.md) -- πŸ’‘ [Release Notes](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html) -- πŸ› [File a bug](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=1_bug_report.md) -- ✨ [File a Feature Request](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=2_feature_request.md) +- πŸ’‘ [Release Notes](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html) +- πŸ› [File a bug](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=1_bug_report.md) +- ✨ [File a Feature Request](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=2_feature_request.md) diff --git a/examples/llm_eval/README.md b/examples/llm_eval/README.md index bad3ca477..c664a2f8e 100644 --- a/examples/llm_eval/README.md +++ b/examples/llm_eval/README.md @@ -53,7 +53,7 @@ python lm_eval_hf.py --model hf \ > **_NOTE:_** `MXFP8_DEFAULT_CFG` is one the [OCP Microscaling Formats (MX Formats)](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) family which defines a set of block-wise dynamic quantization formats. The specifications can be found in the [official documentation](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf). Currently we support all MX formats for simulated quantization, including `MXFP8 (E5M2, E4M3), MXFP6 (E3M2, E2M3), MXFP4, MXINT8`. However, only `MXFP8 (E4M3)` is in our example configurations, users can create their own configurations for other MX formats by simply modifying the `num_bits` field in the `MXFP8_DEFAULT_CFG`. -> **_NOTE:_** ModelOpt's triton kernels give faster NVFP4 simulated quantization. For details, please see the [installation guide](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/_installation_for_Linux.html#accelerated-quantization-with-triton-kernels). +> **_NOTE:_** ModelOpt's triton kernels give faster NVFP4 simulated quantization. For details, please see the [installation guide](https://nvidia.github.io/Model-Optimizer/getting_started/_installation_for_Linux.html#accelerated-quantization-with-triton-kernels). For data-parallel evaluation, launch with `accelerate launch --multi_gpu --num_processes ` (as shown earlier). @@ -241,7 +241,7 @@ This is useful for evaluating quantized models deployed with vLLM or any model s --tensor-parallel-size # Adjust as needed ``` - To generate the quantized model such as `nvidia/Llama-3.1-8B-Instruct-FP8`, please refer to instructions [here](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#deploy-fp8-quantized-model-using-vllm-and-sglang). Note currently modelopt quantized model support in vLLM is limited, we are working on expanding the model and quant formats support. 
+ To generate the quantized model such as `nvidia/Llama-3.1-8B-Instruct-FP8`, please refer to instructions [here](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_ptq#deploy-fp8-quantized-model-using-vllm-and-sglang). Note currently modelopt quantized model support in vLLM is limited, we are working on expanding the model and quant formats support. 1. **Make the script executable (if not already):** diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index 4009fbd6d..8a1008367 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -11,13 +11,13 @@ This section focuses on Post-training quantization, a technique that reduces mod | **Section** | **Description** | **Link** | **Docs** | | :------------: | :------------: | :------------: | :------------: | | Pre-Requisites | Required & optional packages to use this technique | \[[Link](#pre-requisites)\] | | -| Getting Started | Learn how to optimize your models using PTQ to reduce precision and improve inference efficiency | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | +| Getting Started | Learn how to optimize your models using PTQ to reduce precision and improve inference efficiency | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | | Support Matrix | View the support matrix to see quantization compatibility and feature availability across different models | \[[Link](#support-matrix)\] | | -| AutoQuantize | Automatically chooses layers/precisions for mixed precision quantization to enhanced inference performance and accuracy tradeoffs | \[[Link](#autoquantize)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_pytorch_quantization.html#optimal-partial-quantization-using-auto-quantize)\] | -| Real Quant | Real Quant compresses model weights in a low-precision format to reduce memory requirements of quantization. | \[[Link](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_compress_quantized_models.html)\] | | +| AutoQuantize | Automatically chooses layers/precisions for mixed precision quantization to enhanced inference performance and accuracy tradeoffs | \[[Link](#autoquantize)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/_pytorch_quantization.html#optimal-partial-quantization-using-auto-quantize)\] | +| Real Quant | Real Quant compresses model weights in a low-precision format to reduce memory requirements of quantization. | \[[Link](https://nvidia.github.io/Model-Optimizer/guides/_compress_quantized_models.html)\] | | | Framework Scripts | Example scripts demonstrating quantization techniques for optimizing Hugging Face / NeMo / Megatron-LM models | \[[Link](#framework-scripts)\] | | | Evaluate Accuracy | Evaluate your model's accuracy! 
| \[[Link](#evaluate-accuracy)\] | | -| Exporting Checkpoints | Export to Hugging Face Unified Checkpoint and deploy on TRT-LLM/vLLM/SGLang | \[[Link](#exporting-checkpoints)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/3_unified_hf.html)\] | +| Exporting Checkpoints | Export to Hugging Face Unified Checkpoint and deploy on TRT-LLM/vLLM/SGLang | \[[Link](#exporting-checkpoints)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/deployment/3_unified_hf.html)\] | | Pre-Quantized Checkpoints | Ready to deploy Hugging Face pre-quantized checkpoints | \[[Link](#pre-quantized-checkpoints)\] | | | Resources | Extra links to relevant resources | \[[Link](#resources)\] | | @@ -29,7 +29,7 @@ This section focuses on Post-training quantization, a technique that reduces mod For Hugging Face models, please use the TensorRT-LLM docker image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2`). For NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.09`). -Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information. +Visit our [installation docs](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html) for more information. Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install example-specific dependencies. @@ -49,7 +49,7 @@ Similarly, for vLLM or SGLang deployment, please use their installation docs. ### 1. Quantize (Post Training Quantization) -With the simple API below, you can very easily use Model Optimizer to quantize your model. Model Optimizer achieves this by converting the precision of your model to the desired precision, and then using a small dataset (typically 128-512 samples) to [calibrate](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_basic_quantization.html) the quantization scaling factors. The accuracy of PTQ is typically robust across different choices of calibration data, by default Model Optimizer uses a mix of [`cnn_dailymail`](https://huggingface.co/datasets/abisee/cnn_dailymail) and [`nemotron-post-training-dataset-v2`](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2). Users can try other datasets by easily modifying the `calib_set`. +With the simple API below, you can very easily use Model Optimizer to quantize your model. Model Optimizer achieves this by converting the precision of your model to the desired precision, and then using a small dataset (typically 128-512 samples) to [calibrate](https://nvidia.github.io/Model-Optimizer/guides/_basic_quantization.html) the quantization scaling factors. The accuracy of PTQ is typically robust across different choices of calibration data, by default Model Optimizer uses a mix of [`cnn_dailymail`](https://huggingface.co/datasets/abisee/cnn_dailymail) and [`nemotron-post-training-dataset-v2`](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2). Users can try other datasets by easily modifying the `calib_set`. ```python import modelopt.torch.quantization as mtq @@ -110,7 +110,7 @@ with torch.inference_mode(): After the TensorRT-LLM checkpoint export, you can use the `trtllm-build` build command to build the engines from the exported checkpoints. Please check the [TensorRT-LLM Build API](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/architecture/workflow.md#build-apis) documentation for reference. 
-Please reference our [framework scripts](#framework-scripts) and our [docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html) for more details. +Please reference our [framework scripts](#framework-scripts) and our [docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html) for more details. ## Support Matrix @@ -143,7 +143,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http > *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. Different models may have different accuracy loss and usually the accuracy loss is more significant when the base model is small. If the accuracy after PTQ is not meeting the requirement, please try either modifying [hf_ptq.py](./hf_ptq.py) and disabling the KV cache quantization or using the [QAT](./../llm_qat/README.md) instead.* -> You can also create your own custom config using [this](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_pytorch_quantization.html#custom-calibration-algorithm) guide. +> You can also create your own custom config using [this](https://nvidia.github.io/Model-Optimizer/guides/_pytorch_quantization.html#custom-calibration-algorithm) guide. ### NeMo Supported Models @@ -151,7 +151,7 @@ Please refer to the [NeMo 2.0 PTQ documentation](https://docs.nvidia.com/nemo-fr ## AutoQuantize -[AutoQuantize (`mtq.auto_quantize`)](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) is a PTQ algorithm which quantizes a model by searching for the best quantization format per-layer while meeting performance constraints specified by the user. `AutoQuantize` streamlines the trade-off of model accuracy and performance. +[AutoQuantize (`mtq.auto_quantize`)](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) is a PTQ algorithm which quantizes a model by searching for the best quantization format per-layer while meeting performance constraints specified by the user. `AutoQuantize` streamlines the trade-off of model accuracy and performance. Currently `AutoQuantize` supports only `auto_quantize_bits` as the performance constraint (for both weight-only quantization and weight & activation quantization). `auto_quantize_bits` constraint specifies the effective number of bits for the quantized model. @@ -160,7 +160,7 @@ You may specify an `auto_quantize_bits` constraint such as 4.8 for mixed precisi `AutoQuantize` will automatically quantize highly sensitive layers in `FP8_DEFAULT_CFG` while keeping less sensitive layers in `NVFP4_DEFAULT_CFG` (and even skip quantization for any extremely sensitive layers) so that the the final mixed precision quantized model has an effective quantized bits of 4.8. This model would give a better accuracy than the model quantized with vanilla `NVFP4_DEFAULT_CFG` configuration since the more aggressive `NVFP4_DEFAULT_CFG` quantization was not applied for the highly sensitive layers. 
-Here is an example usage for `AutoQuantize` algorithm (Please see [auto_quantize](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) API for more details): +Here is an example usage for `AutoQuantize` algorithm (Please see [auto_quantize](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) API for more details): ```python @@ -432,18 +432,18 @@ After the TensorRT-LLM checkpoint export, you can use the `trtllm-build` build c ## Pre-Quantized Checkpoints -- Ready-to-deploy checkpoints \[[πŸ€— Hugging Face - Nvidia TensorRT Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] +- Ready-to-deploy checkpoints \[[πŸ€— Hugging Face - Nvidia Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] - Deployable on [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [vLLM](https://github.com/vllm-project/vllm) and [SGLang](https://github.com/sgl-project/sglang) - More models coming soon! ## Resources -- πŸ“… [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146) -- πŸ“– [Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer) +- πŸ“… [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) +- πŸ“– [Documentation](https://nvidia.github.io/Model-Optimizer) - 🎯 [Benchmarks](../benchmark.md) -- πŸ’‘ [Release Notes](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html) -- πŸ› [File a bug](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=1_bug_report.md) -- ✨ [File a Feature Request](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=2_feature_request.md) +- πŸ’‘ [Release Notes](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html) +- πŸ› [File a bug](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=1_bug_report.md) +- ✨ [File a Feature Request](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=2_feature_request.md) ### Technical Resources diff --git a/examples/llm_ptq/notebooks/1_FP4-FP8_PTQ_Min-Max_Calibration.ipynb b/examples/llm_ptq/notebooks/1_FP4-FP8_PTQ_Min-Max_Calibration.ipynb index dbf1339d7..50b53ac38 100644 --- a/examples/llm_ptq/notebooks/1_FP4-FP8_PTQ_Min-Max_Calibration.ipynb +++ b/examples/llm_ptq/notebooks/1_FP4-FP8_PTQ_Min-Max_Calibration.ipynb @@ -5,9 +5,9 @@ "id": "9161daa2-03a6-41cd-b349-410004ab37c4", "metadata": {}, "source": [ - "# Post-Training Quantization with Min-Max Calibration using TensorRT Model Optimizer PTQ\n", + "# Post-Training Quantization with Min-Max Calibration using Model Optimizer PTQ\n", "\n", - "This notebook demonstrates how to apply standard Post-Training Quantization (PTQ) using min-max calibration on an LLMβ€”specifically meta-llama/Llama-3.1-8B-Instructβ€”with NVIDIA's TensorRT Model Optimizer (ModelOpt) PTQ toolkit. We walk through loading the model, calibrating it using a CNN/DailyMail dataset sample, applying FP8 quantization, generating outputs, and exporting the quantized model.\n", + "This notebook demonstrates how to apply standard Post-Training Quantization (PTQ) using min-max calibration on an LLMβ€”specifically meta-llama/Llama-3.1-8B-Instructβ€”with NVIDIA's Model Optimizer (ModelOpt) PTQ toolkit. 
We walk through loading the model, calibrating it using a CNN/DailyMail dataset sample, applying FP8 quantization, generating outputs, and exporting the quantized model.\n", "\n", "Key Dependencies:\n", "- nvidia-modelopt\n", @@ -33,7 +33,7 @@ "\n", "- `torch`: Used for tensor computation and model execution.\n", "\n", - "- `modelopt.torch.quantization`: Core API for quantization using TensorRT ModelOpt PTQ.\n", + "- `modelopt.torch.quantization`: Core API for quantization using ModelOpt PTQ.\n", "\n", "- `transformers`: Hugging Face interface to load and tokenize LLMs.\n", "\n", @@ -174,7 +174,7 @@ "metadata": {}, "source": [ "### 6. Set Quantization Configuration and Apply\n", - "- Apply FP8 quantization using the default min-max config provided by TensorRT ModelOpt.\n", + "- Apply FP8 quantization using the default min-max config provided by ModelOpt.\n", "- This pass captures the range of activations and applies a quantization transform.\n", "- To change the quantization configuration, you simply need to change the value of the `quant_cfg` variable. For example, to change this from FP8 to NVFP4, you can set it to `mtq.NVFP4_DEFAULT_CFG`\n", "\n", @@ -262,7 +262,7 @@ "# βœ… Conclusion & Key Takeaways\n", " βœ… Min-max calibration is a fast and simple way to apply quantization with good performance tradeoffs.\n", "\n", - " βœ… TensorRT-LLM ModelOpt PTQ abstracts away many of the complexities of quantization while still offering flexibility and export options.\n", + " βœ… ModelOpt PTQ abstracts away many of the complexities of quantization while still offering flexibility and export options.\n", "\n", " βœ… Using a representative dataset like cnn_dailymail improves calibration accuracy for summarization-style models.\n", "\n", diff --git a/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb b/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb index e62a72ae7..79a545427 100644 --- a/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb +++ b/examples/llm_ptq/notebooks/2_PTQ_AWQ_Calibration.ipynb @@ -5,7 +5,7 @@ "id": "9161daa2-03a6-41cd-b349-410004ab37c4", "metadata": {}, "source": [ - "# Weight-Only INT4 Quantization with AWQ using TensorRT ModelOpt PTQ\n", + "# Weight-Only INT4 Quantization with AWQ using ModelOpt PTQ\n", "\n", "This notebook demonstrates how to apply weight-only INT4 quantization using the Activation-aware Weight Quantization (AWQ) technique via NVIDIA TensorRT-LLM Model Optimizer (ModelOpt) PTQ.\n", "\n", @@ -35,7 +35,7 @@ "\n", "- `torch`: Used for tensor computation and model execution.\n", "\n", - "- `modelopt.torch.quantization`: Core API for quantization using TensorRT ModelOpt PTQ.\n", + "- `modelopt.torch.quantization`: Core API for quantization using ModelOpt PTQ.\n", "\n", "- `transformers`: Hugging Face interface to load and tokenize LLMs.\n", "\n", @@ -273,7 +273,7 @@ "\n", " βœ… Block-wise quantization (e.g., block size = 128) enables hardware-friendly tensor layouts that optimize for tensor core utilization on NVIDIA GPUs.\n", "\n", - " βœ… The TensorRT-LLM ModelOpt PTQ API provides a flexible and high-level interface for experimenting with quantization formats, including full customization of AWQ configs.\n", + " βœ… ModelOpt PTQ API provides a flexible and high-level interface for experimenting with quantization formats, including full customization of AWQ configs.\n", "\n", " βœ… Exported models remain compatible with Hugging Face interfaces, making them easy to use in production pipelines or deploy via inference frameworks like vLLM or 
TensorRT-LLM." ] diff --git a/examples/llm_ptq/notebooks/3_PTQ_AutoQuantization.ipynb b/examples/llm_ptq/notebooks/3_PTQ_AutoQuantization.ipynb index 8d5a79374..122569489 100644 --- a/examples/llm_ptq/notebooks/3_PTQ_AutoQuantization.ipynb +++ b/examples/llm_ptq/notebooks/3_PTQ_AutoQuantization.ipynb @@ -5,7 +5,7 @@ "id": "b80da483-f81a-408f-9755-6c1bc9d84119", "metadata": {}, "source": [ - "# AutoQuantization with TensorRT Model Optimizer PTQ\n", + "# AutoQuantization with Model Optimizer PTQ\n", "\n", "This notebook demonstrates how to use ModelOpt PTQ's auto_quantize feature to perform automated mixed-precision quantization on the Meta-LLaMA-3-8B model. You'll define a target effective bit rate (e.g., 8.0), provide a search space of quantization formats, and optionally include KV cache quantization.\n", "\n", @@ -386,7 +386,7 @@ "metadata": {}, "source": [ "# βœ… Conclusion & Key Takeaways\n", - " βœ… AutoQuant in TensorRT-LLM ModelOpt enables fast, automated mixed-precision quantization by searching across multiple formats (e.g., FP8, INT4-AWQ) to meet a user-defined effective bit constraint.\n", + " βœ… AutoQuant in ModelOpt enables fast, automated mixed-precision quantization by searching across multiple formats (e.g., FP8, INT4-AWQ) to meet a user-defined effective bit constraint.\n", "\n", " βœ… Using a small calibration set with loss-based scoring, AutoQuant intelligently selects the optimal quantization format per layerβ€”balancing model size, performance, and accuracy.\n", "\n", diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index 154b7c07b..75c2cf02b 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -7,10 +7,10 @@ Quantization Aware Training (QAT) helps to improve the model accuracy beyond pos | **Section** | **Description** | **Link** | **Docs** | | :------------: | :------------: | :------------: | :------------: | | Pre-Requisites | Required & optional packages to use this technique | \[[Link](#pre-requisites)\] | | -| Getting Started | Learn how to optimize your models using QAT to reduce precision and improve model accuracy post quantization | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | +| Getting Started | Learn how to optimize your models using QAT to reduce precision and improve model accuracy post quantization | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | | Support Matrix | View the support matrix to see quantization compatibility and feature availability across different models | \[[Link](#support-matrix)\] | | -| End to End QAT | Example scripts demonstrating quantization techniques for optimizing Hugging Face models | \[[Link](#end-to-end-qat-example)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | -| End to End QAD | Example scripts demonstrating quantization aware distillation techniques for optimizing Hugging Face models | \[[Link](#end-to-end-qad-example)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | +| End to End QAT | Example scripts demonstrating quantization techniques for optimizing Hugging Face models | \[[Link](#end-to-end-qat-example)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | +| End to End QAD | Example scripts demonstrating quantization aware distillation techniques for optimizing Hugging Face models | 
\[[Link](#end-to-end-qad-example)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | | NeMo QAT/QAD Simplified Flow | Example script demonstrating end-to-end QAT/QAD in NeMo | \[[Link](../nemo_run/qat/README.md)\] | | | Evaluate Accuracy | Evaluating model accuracy after QAT/QAD (with fake quantization) | \[[Link](#testing-qat-model-with-llm-benchmarks-for-accuracy-evaluation)\] | | | Deployment | Deploying the model after QAT/QAD | \[[Link](#deployment)\] | | @@ -26,9 +26,9 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#pre-requisites) for ## Getting Started -In QAT, a model quantized using [mtq.quantize()](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.quantize) can be directly fine-tuned with the original training pipeline. During QAT, the scaling factors inside quantizers are frozen and the model weights are fine-tuned. +In QAT, a model quantized using [mtq.quantize()](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.quantize) can be directly fine-tuned with the original training pipeline. During QAT, the scaling factors inside quantizers are frozen and the model weights are fine-tuned. -To learn more about the QAT feature, please refer to the [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_pytorch_quantization.html#quantization-aware-training-qat). +To learn more about the QAT feature, please refer to the [documentation](https://nvidia.github.io/Model-Optimizer/guides/_pytorch_quantization.html#quantization-aware-training-qat). Quantization aware distillation (QAD) can be used to further improve accuracy of the model using the original full precision model as a teacher model in cases where QAT is not enough. @@ -42,7 +42,7 @@ The Llama3-8B fine-tuning and QAT below requires a minimum of 2 x 80GB GPUs per #### QAT Example Workflow -In QAT, a model quantized using [mtq.quantize()](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.quantize) can be directly fine-tuned with the original training pipeline. During QAT, the scaling factors inside quantizers are frozen and the model weights are fine-tuned. +In QAT, a model quantized using [mtq.quantize()](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.quantize) can be directly fine-tuned with the original training pipeline. During QAT, the scaling factors inside quantizers are frozen and the model weights are fine-tuned. Here is the recommended QAT workflow: @@ -91,10 +91,10 @@ trainer.train() # Train the quantized model (i.e, QAT) trainer.save_model() ``` -> **_NOTE:_** The example above uses [mto.modelopt_state](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.opt.conversion.html#modelopt.torch.opt.conversion.modelopt_state) and [mto.restore_from_modelopt_state](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.opt.conversion.html#modelopt.torch.opt.conversion.restore_from_modelopt_state) for saving and restoring of ModelOpt -> modified model. ModelOpt provides additional methods/workflows for saving and restoring ModelOpt modified model. 
Please see [saving & restoring](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/2_save_load.html) to learn more. +> **_NOTE:_** The example above uses [mto.modelopt_state](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.opt.conversion.html#modelopt.torch.opt.conversion.modelopt_state) and [mto.restore_from_modelopt_state](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.opt.conversion.html#modelopt.torch.opt.conversion.restore_from_modelopt_state) for saving and restoring of ModelOpt +> modified model. ModelOpt provides additional methods/workflows for saving and restoring ModelOpt modified model. Please see [saving & restoring](https://nvidia.github.io/Model-Optimizer/guides/2_save_load.html) to learn more. -> **_NOTE:_** ModelOpt provides accelerated quantization kernels using Triton that significantly speed up NVFP4 format QAT. For details, see the [installation guide](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/_installation_for_Linux.html#accelerated-quantization-with-triton-kernels). +> **_NOTE:_** ModelOpt provides accelerated quantization kernels using Triton that significantly speed up NVFP4 format QAT. For details, see the [installation guide](https://nvidia.github.io/Model-Optimizer/getting_started/_installation_for_Linux.html#accelerated-quantization-with-triton-kernels). A simple QAT training example can be found in [simple_qat_train.py](simple_qat_train.py). It can train the model using a single GPU on [Daring-Anteater](https://huggingface.co/datasets/nvidia/Daring-Anteater) dataset. To run: @@ -169,7 +169,7 @@ This script supports the following models out of the box. ### Supported quantization configuration for QAT -Current quantization configs can be found [here](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/modelopt/torch/quantization/config.py). +Current quantization configs can be found [here](https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt/torch/quantization/config.py). These are the recommended quantization configurations for QAT: @@ -185,7 +185,7 @@ mtq.NVFP4_DEFAULT_CFG # NVFP4 dynamic block weight & activation quantization mtq.MXFP8_DEFAULT_CFG # MXFP8 per-tensor weight and activation quantization ``` -You can also create your own custom config using [this](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_pytorch_quantization.html#custom-calibration-algorithm) guide. +You can also create your own custom config using [this](https://nvidia.github.io/Model-Optimizer/guides/_pytorch_quantization.html#custom-calibration-algorithm) guide. ## End-to-end QAT Example @@ -364,15 +364,15 @@ vllm serve llama3-fp4-qlora-hf/base_model --enable-lora --lora-modules adapter=l > ## Pre-Quantized Checkpoints -- Ready-to-deploy checkpoints \[[πŸ€— Hugging Face - Nvidia TensorRT Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] +- Ready-to-deploy checkpoints \[[πŸ€— Hugging Face - Nvidia Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] - Deployable on [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [vLLM](https://github.com/vllm-project/vllm) and [SGLang](https://github.com/sgl-project/sglang) - More models coming soon! 
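The `mto.modelopt_state` / `mto.restore_from_modelopt_state` pattern mentioned in the note above can be sketched as below. The tiny model, INT8 config, and file name are illustrative assumptions; the structure (save the ModelOpt state next to the regular weights, re-apply it before loading them) follows the linked save/restore guide.

```python
import torch
import torch.nn as nn
import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq

def build_model():
    return nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))

model = build_model()
calib_data = [torch.randn(2, 16) for _ in range(8)]

def forward_loop(m):
    # Run a few calibration batches so the quantizers can record ranges.
    for x in calib_data:
        m(x)

model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_loop)

# Save: keep the ModelOpt (quantizer) state next to the regular weights.
torch.save(
    {"modelopt_state": mto.modelopt_state(model), "model_state_dict": model.state_dict()},
    "qat_checkpoint.pt",
)

# Restore: re-apply the ModelOpt state to a fresh model *before* loading the weights.
ckpt = torch.load("qat_checkpoint.pt", weights_only=False)
restored = mto.restore_from_modelopt_state(build_model(), ckpt["modelopt_state"])
restored.load_state_dict(ckpt["model_state_dict"])
```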
## Resources -- πŸ“… [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146) -- πŸ“– [Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer) +- πŸ“… [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) +- πŸ“– [Documentation](https://nvidia.github.io/Model-Optimizer) - 🎯 [Benchmarks](../benchmark.md) -- πŸ’‘ [Release Notes](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html) -- πŸ› [File a bug](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=1_bug_report.md) -- ✨ [File a Feature Request](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=2_feature_request.md) +- πŸ’‘ [Release Notes](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html) +- πŸ› [File a bug](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=1_bug_report.md) +- ✨ [File a Feature Request](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=2_feature_request.md) diff --git a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb index 0c292563c..a9bb6589b 100644 --- a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb +++ b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb @@ -15,7 +15,7 @@ "source": [ "**Quantization Aware Training (QAT)** is a method that learn the effects of quantization during neural network post-training to preserve accuracy when deploying models in very-low-precision formats. QAT inserts quantizer nodes into the computational graph, mimicking the rounding and clamping operations that occur during actual quantization. This allows the model to adapt its weights and activations to mitigate accuracy loss.\n", "\n", - "This notebook demonstrates how to apply Quantization Aware Training (QAT) to an LLM, Qwen3-8b in this example, with NVIDIA's TensorRT Model Optimizer (ModelOpt) QAT toolkit. We walk through downloading and loading the model, calibrates on a small eval subset, applying NVFP4 quantization and finally deploying the quantized model to TensorRT-LLM." + "This notebook demonstrates how to apply Quantization Aware Training (QAT) to an LLM, Qwen3-8b in this example, with NVIDIA's Model Optimizer (ModelOpt) QAT toolkit. We walk through downloading and loading the model, calibrating on a small eval subset, applying NVFP4 quantization and finally deploying the quantized model to TensorRT-LLM." ] }, { @@ -563,7 +563,7 @@ "source": [ "**Note:** The QAT checkpoint for `nvfp4` config can be created by using `--quant_cfg NVFP4_DEFAULT_CFG` in QAT example.\n", "\n", - "See more details on deployment of quantized model [here](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/cb98b221e1b1730226257e20b4c81ebb259fc2d6/examples/llm_ptq/README.md)." + "See more details on deployment of quantized model [here](https://github.com/NVIDIA/Model-Optimizer/blob/main/examples/llm_ptq/README.md)." ] }, { @@ -623,7 +623,7 @@ "metadata": {}, "source": [ "## Exporting Quantized Model for deployment\n", - "Before deploying the model with TensorRT-LLM you will need to export the model checkpoint files. This is similar to the step you take for a quantized PTQ Model. To export the unified Hugging Face checkpoints, which can be deployed on TensorRT-LLM Pytorch, vLLM and SGLang you will need to run the [huggingface_example.sh](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/llm_ptq/scripts/huggingface_example.sh) script found in the TensorRT Model Optimizer repo. 
" + "Before deploying the model with TensorRT-LLM you will need to export the model checkpoint files. This is similar to the step you take for a quantized PTQ Model. To export the unified Hugging Face checkpoints, which can be deployed on TensorRT-LLM Pytorch, vLLM and SGLang you will need to run the [huggingface_example.sh](https://github.com/NVIDIA/Model-Optimizer/blob/main/examples/llm_ptq/scripts/huggingface_example.sh) script found in the Model Optimizer repo. " ] }, { @@ -643,7 +643,7 @@ "source": [ "%%sh # [run in TensorRT-LLM container]\n", "\n", - "git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git" + "git clone https://github.com/NVIDIA/Model-Optimizer.git" ] }, { @@ -663,7 +663,7 @@ "source": [ "%%sh # [run in TensorRT-LLM container]\n", "\n", - "cd TensorRT-Model-Optimizer/\n", + "cd Model-Optimizer/\n", "pip install -e ." ] }, @@ -691,7 +691,7 @@ "\n", "# run conversion script\n", "cd ..\n", - "bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model $(pwd)/qat/checkpoint-450/ --quant nvfp4" + "bash Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model $(pwd)/qat/checkpoint-450/ --quant nvfp4" ] }, { diff --git a/examples/llm_sparsity/README.md b/examples/llm_sparsity/README.md index e7b8b30e0..4fba746d3 100644 --- a/examples/llm_sparsity/README.md +++ b/examples/llm_sparsity/README.md @@ -1,8 +1,8 @@ # Sparsity for HuggingFace Models -In this tutorial, we demonstrate how to use Nvidia TensorRT Model Optimizer to perform Post-Training Sparsification (PTS) and Sparsity Aware Training (SAT) on a HuggingFace [Llama2-7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) model. +In this tutorial, we demonstrate how to use Nvidia Model Optimizer to perform Post-Training Sparsification (PTS) and Sparsity Aware Training (SAT) on a HuggingFace [Llama2-7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) model. -To learn more about the sparsity feature, please refer to the [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/6_sparsity.html). +To learn more about the sparsity feature, please refer to the [documentation](https://nvidia.github.io/Model-Optimizer/guides/6_sparsity.html). ## Getting Started diff --git a/examples/model_hub/README.md b/examples/model_hub/README.md index bf4b1035d..d52e25c18 100644 --- a/examples/model_hub/README.md +++ b/examples/model_hub/README.md @@ -27,4 +27,4 @@ To deploy and run on SGLang: python run_llama_fp8_sglang.py ``` -If you want to run post-training quantization with TensorRT Model Optimizer for your selected models, check [here](../llm_ptq/README.md). +If you want to run post-training quantization with Model Optimizer for your selected models, check [here](../llm_ptq/README.md). diff --git a/examples/nemo_run/prune_distill/README.md b/examples/nemo_run/prune_distill/README.md index 314c6f5a0..5056ffbbf 100644 --- a/examples/nemo_run/prune_distill/README.md +++ b/examples/nemo_run/prune_distill/README.md @@ -50,14 +50,14 @@ The resulting compressed student maintains competitive performance while being s You can run the example either locally or on a [Slurm cluster](ADVANCED.md). -To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.09 or higher. Clone the `TensorRT-Model-Optimizer` repository and `NeMo` repository (checkout a specific commit for NeMo), then mount it onto your docker container. 
+To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.09 or higher. Clone the `Model-Optimizer` repository and `NeMo` repository (checkout a specific commit for NeMo), then mount it onto your docker container. -- `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git` +- `git clone https://github.com/NVIDIA/Model-Optimizer.git` Example docker command: ```bash -docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer:/opt/TensorRT-Model-Optimizer --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash +docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/Model-Optimizer:/opt/TensorRT-Model-Optimizer --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash ``` You will also need to set your Huggingface token with `export HF_TOKEN=`. You may also need to enable write access to the docker container to the `examples/nemo_run` folder by doing `chmod 777 nemo_run` so that logs can be written. diff --git a/examples/nemo_run/qat/README.md b/examples/nemo_run/qat/README.md index 1a56d9513..ae110d51a 100644 --- a/examples/nemo_run/qat/README.md +++ b/examples/nemo_run/qat/README.md @@ -56,16 +56,16 @@ The resulting exported checkpoint also is much smaller in memory at 6.4GB compar You can run the example either locally or on a [Slurm cluster](ADVANCED.md). -To run the example locally, first clone the `TensorRT-Model-Optimizer` repository, then mount the repository to a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.09. After launching the Docker container, make sure to also set your HuggingFace token for dataset/model downloading. +To run the example locally, first clone the `Model-Optimizer` repository, then mount the repository to a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.09. After launching the Docker container, make sure to also set your HuggingFace token for dataset/model downloading. 
Set up repo: -- `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git` +- `git clone https://github.com/NVIDIA/Model-Optimizer.git` Run docker command (modify with your paths) and export the HuggingFace token: ```bash -docker run -v /home/user/:/home/user/ -v /home/user/TensorRT-Model-Optimizer/:/opt/TensorRT-Model-Optimizer/ --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash +docker run -v /home/user/:/home/user/ -v /home/user/Model-Optimizer/:/opt/TensorRT-Model-Optimizer/ --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.09 bash export HF_TOKEN= ``` diff --git a/examples/nemo_run/qat/nemo_qat_flow.py b/examples/nemo_run/qat/nemo_qat_flow.py index af5a602ec..0742916c9 100644 --- a/examples/nemo_run/qat/nemo_qat_flow.py +++ b/examples/nemo_run/qat/nemo_qat_flow.py @@ -86,7 +86,7 @@ def get_args(): type=str, default="nvfp4", choices=quant_cfg_choices_list, - help="TensorRT-Model-Optimizer quantization algorithm", + help="Model-Optimizer quantization algorithm", ) parser.add_argument( "--use-slurm", diff --git a/examples/onnx_ptq/README.md b/examples/onnx_ptq/README.md index b1d22b896..d352b847e 100644 --- a/examples/onnx_ptq/README.md +++ b/examples/onnx_ptq/README.md @@ -11,7 +11,7 @@ Model Optimizer enables highly performant quantization formats including NVFP4, | **Section** | **Description** | **Link** | **Docs** | | :------------: | :------------: | :------------: | :------------: | | Pre-Requisites | Required & optional packages to use this technique | [Link](#pre-requisites) | | -| Getting Started | Learn how to optimize your models using PTQ to reduce precision and improve inference efficiency | [Link](#getting-started) | [docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_onnx_quantization.html) | +| Getting Started | Learn how to optimize your models using PTQ to reduce precision and improve inference efficiency | [Link](#getting-started) | [docs](https://nvidia.github.io/Model-Optimizer/guides/_onnx_quantization.html) | | Support Matrix | View the ONNX export supported LLM models | [Link](#onnx-export-supported-llm-models) | | | PyTorch to ONNX | Example scripts demonstrating how to quantize with PyTorch and then convert to ONNX | [Link](#torch-quantization-to-onnx-example-for-mxfp8-int4-or-nvfp4-precision) | | | Advanced Features | Examples demonstrating use advanced ONNX quantization features | [Link](#advanced-features) | | @@ -24,7 +24,7 @@ Model Optimizer enables highly performant quantization formats including NVFP4, ### Docker -Please use the TensorRT docker image (e.g., `nvcr.io/nvidia/tensorrt:25.08-py3`) or visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information. +Please use the TensorRT docker image (e.g., `nvcr.io/nvidia/tensorrt:25.08-py3`) or visit our [installation docs](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html) for more information. Set the following environment variables inside the TensorRT docker. 
@@ -275,16 +275,16 @@ trtexec --onnx=/path/to/identity_neural_network.quant.onnx \ ## Pre-Quantized Checkpoints -- Ready-to-deploy checkpoints that can be exported to ONNX format (if supported as per the [Support Matrix](#onnx-export-supported-llm-models)) \[[πŸ€— Hugging Face - Nvidia TensorRT Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] +- Ready-to-deploy checkpoints that can be exported to ONNX format (if supported as per the [Support Matrix](#onnx-export-supported-llm-models)) \[[πŸ€— Hugging Face - Nvidia Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] ## Resources -- πŸ“… [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146) -- πŸ“– [Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer) +- πŸ“… [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) +- πŸ“– [Documentation](https://nvidia.github.io/Model-Optimizer) - 🎯 [Benchmarks](../benchmark.md) -- πŸ’‘ [Release Notes](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html) -- πŸ› [File a bug](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=1_bug_report.md) -- ✨ [File a Feature Request](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=2_feature_request.md) +- πŸ’‘ [Release Notes](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html) +- πŸ› [File a bug](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=1_bug_report.md) +- ✨ [File a Feature Request](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=2_feature_request.md) ### Technical Resources diff --git a/examples/pruning/README.md b/examples/pruning/README.md index b25ee1849..509960a94 100644 --- a/examples/pruning/README.md +++ b/examples/pruning/README.md @@ -15,7 +15,7 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar | **Section** | **Description** | **Link** | **Docs** | | :------------: | :------------: | :------------: | :------------: | | Pre-Requisites | Required & optional packages to use this technique | \[[Link](#pre-requisites)\] | | -| Getting Started | Learn how to use the pruning API | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/3_pruning.html)\] | +| Getting Started | Learn how to use the pruning API | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/3_pruning.html)\] | | Support Matrix | View the support matrix to see available pruning algorithms and their compatibility with different models and frameworks | \[[Link](#support-matrix)\] | | | Pruning Guidelines | Guidelines for choosing how and how much to prune for best results | \[[Link](#pruning-guidelines)\] | | | Examples | Examples of different pruning methods | \[[Link](#examples)\] | | @@ -189,7 +189,7 @@ Some of the models pruned using Minitron method followed by distillation and pos ### FastNAS Pruning for PyTorch Computer Vision Models -Check out the FastNAS pruning example usage in the [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/3_pruning.html#pruning-and-subnet-search). +Check out the FastNAS pruning example usage in the [documentation](https://nvidia.github.io/Model-Optimizer/guides/3_pruning.html#pruning-and-subnet-search). 
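As a rough companion to the FastNAS pointer above, the sketch below shows the general shape of a `mtp.prune` call on a small TorchVision model. The random data, score function, constraint value, and checkpoint name are placeholders, and the exact `config` keys are assumptions based on the linked pruning guide, so please verify them against that guide for your ModelOpt version.

```python
import torch
import torchvision
from torch.utils.data import DataLoader, TensorDataset
import modelopt.torch.prune as mtp

model = torchvision.models.resnet18(num_classes=10)
dummy_input = torch.randn(1, 3, 32, 32)  # used to trace the model

# Stand-in CIFAR-like data; replace with your real training/validation loaders.
data = TensorDataset(torch.randn(64, 3, 32, 32), torch.randint(0, 10, (64,)))
train_loader = DataLoader(data, batch_size=16)

def score_func(m):
    # Placeholder validation score; in practice return accuracy on a held-out set.
    m.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in train_loader:
            correct += (m(x).argmax(dim=1) == y).sum().item()
            total += y.numel()
    return correct / total

pruned_model, _ = mtp.prune(
    model=model,
    mode="fastnas",
    constraints={"flops": "60%"},  # prune to roughly 60% of the original FLOPs
    dummy_input=dummy_input,
    config={
        "data_loader": train_loader,
        "score_func": score_func,
        "checkpoint": "fastnas_search_checkpoint.pth",  # placeholder path
    },
)
# Fine-tune `pruned_model` afterwards to recover accuracy, as the README recommends.
```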
You can also take a look at FastNAS pruning interactive notebook [cifar_resnet](./cifar_resnet.ipynb) in this
directory which showcases the usage of FastNAS for pruning a ResNet 20 model for the CIFAR-10 dataset. The notebook
@@ -204,8 +204,8 @@ with distillation and quantization. The example also demonstrates how to save an

## Resources

-- πŸ“… [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146)
-- πŸ“– [Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer)
-- πŸ’‘ [Release Notes](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html)
-- πŸ› [File a bug](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=1_bug_report.md)
-- ✨ [File a Feature Request](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=2_feature_request.md)
+- πŸ“… [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146)
+- πŸ“– [Documentation](https://nvidia.github.io/Model-Optimizer)
+- πŸ’‘ [Release Notes](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html)
+- πŸ› [File a bug](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=1_bug_report.md)
+- ✨ [File a Feature Request](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=2_feature_request.md)
diff --git a/examples/pruning/cifar_resnet.ipynb b/examples/pruning/cifar_resnet.ipynb
index 1c6f10852..d797e182c 100644
--- a/examples/pruning/cifar_resnet.ipynb
+++ b/examples/pruning/cifar_resnet.ipynb
@@ -19,7 +19,7 @@
     "\n",
     "All of this with just a few lines of code! Yes, it's that simple!\n",
     "\n",
-    "Let's first install `Model Optimizer` following the [installation steps](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html)."
+    "Let's first install `Model Optimizer` following the [installation steps](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html)."
    ]
   },
   {
@@ -462,7 +462,7 @@
     "1. `mcore_minitron`: A pruning method developed by NVIDIA Research for pruning GPT, Mamba and Hybrid Transformer Mamba models in NVIDIA NeMo or Megatron-LM framework. It uses the activation magnitudes to prune the mlp, transformer attention heads, GQA query groups, mamba heads and head dimension, embedding hidden size and number of layers of the model.\n",
     "1. `gradnas`: A light-weight pruning method recommended for language models like Hugging Face BERT, GPT-J. It uses the gradient information to prune the model's linear layers and attention heads to meet the given constraints.\n",
     "\n",
-    "In this example, we will use the `fastnas` mode to prune the ResNet20 model for CIFAR-10 dataset. Checkout the [Model Optimizer GitHub repository](https://github.com/NVIDIA/TensorRT-Model-Optimizer) for more examples.\n",
+    "In this example, we will use the `fastnas` mode to prune the ResNet20 model for the CIFAR-10 dataset. Check out the [Model Optimizer GitHub repository](https://github.com/NVIDIA/Model-Optimizer) for more examples.\n",
     "\n",
     "Let's first use the FastNAS mode to convert a ResNet model and reduce its FLOPs, number of parameters, and latency."
    ]
   },
   {
@@ -800,7 +800,7 @@
     "\n",
     "As we see here, we have reduced the FLOPs and number of parameters which would also result in a improvement in latency with very little loss in accuracy. Good job!\n",
     "\n",
-    "Next: checkout the [Model Optimizer GitHub repository](https://github.com/NVIDIA/TensorRT-Model-Optimizer) for more examples."
+    "Next: check out the [Model Optimizer GitHub repository](https://github.com/NVIDIA/Model-Optimizer) for more examples."
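The save-and-restore step mentioned in the notebook maps onto `modelopt.torch.opt`. A minimal, hedged sketch is shown below; `pruned_model` is assumed to come from an earlier `mtp.prune(...)` call such as the one sketched further up, and the checkpoint name is a placeholder.

```python
# Minimal save/restore sketch; `pruned_model` and the checkpoint path are placeholders.
import torchvision

import modelopt.torch.opt as mto

mto.save(pruned_model, "modelopt_pruned_resnet.pth")  # architecture state + weights

# Later, e.g. in a fresh process: rebuild the unmodified architecture, then restore.
base_model = torchvision.models.resnet18()
restored_model = mto.restore(base_model, "modelopt_pruned_resnet.pth")
```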
] } ], diff --git a/examples/speculative_decoding/README.md b/examples/speculative_decoding/README.md index 0555faedc..c495809bb 100644 --- a/examples/speculative_decoding/README.md +++ b/examples/speculative_decoding/README.md @@ -1,6 +1,6 @@ # Speculative Decoding -[![Documentation](https://img.shields.io/badge/Docs-TensorRT--Model--Optimizer-blue?logo=readthedocs&style=flat-square)](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/5_speculative_decoding.html) +[![Documentation](https://img.shields.io/badge/Docs-NVIDIA--Model--Optimizer-blue?logo=readthedocs&style=flat-square)](https://nvidia.github.io/Model-Optimizer/guides/5_speculative_decoding.html) Speculative decoding accelerates auto-regressive generation in large language models (LLMs) by leveraging a lightweight draft model to predict the next Ξ³ tokens. The main LLM then verifies these candidate tokens in a single forward pass. If the draft model correctly predicts Ξ± tokens, the LLM can accept and generate Ξ±+1 tokens per verification step, significantly improving generation speed. @@ -30,7 +30,7 @@ This example focuses on training with Hugging Face. To train with Megatron‑LM, ### Docker -Please use the PyTorch docker image (e.g., `nvcr.io/nvidia/pytorch:25.06-py3`) or visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information. +Please use the PyTorch docker image (e.g., `nvcr.io/nvidia/pytorch:25.06-py3`) or visit our [installation docs](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html) for more information. Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install dataset and example-specific dependencies. @@ -61,7 +61,7 @@ bash train_eagle3_and_export.sh --base_model meta-llama/Llama-3.2-1B-Instruct -- This one-line command runs a minimal example workflow of training and exporting an EAGLE draft model in Modelopt. Specifically, it -- Initializes the draft model with [default settings](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/modelopt/torch/speculative/eagle/default_config.py#L18) +- Initializes the draft model with [default settings](https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt/torch/speculative/eagle/default_config.py#L18) - Fine-tunes the model on the [Daring-Anteater](https://huggingface.co/datasets/nvidia/Daring-Anteater) dataset - Evaluates the acceptance rate on [MT-Bench](https://huggingface.co/datasets/HuggingFaceH4/mt_bench_prompts) - Exports a checkpoint ready for deployment @@ -79,7 +79,7 @@ For small base models that fit in GPU memory, we can collocate them with draft m --eagle_config eagle_config.json ``` -This command will launch `main.py` with `accelerate`. See [section: interact with modelopt.torch.speculative](#interact-with-modelopttorchspeculative) for more details. +This command will launch `main.py` with `accelerate`. See [section: interact with modelopt.torch.speculative](#interact-with-modelopttorchspeculative) for more details. The saved modelopt checkpoint is similar in architecture to HF models. It can be further optimized through **ModelOpt**, e.g., PTQ and QAT. ## Training Draft Model with Offline Base Model @@ -92,7 +92,7 @@ We support two backends for generating base model hidden states. 
For better effc ```bash python collect_hidden_states/compute_hidden_states_trtllm.py \ - --model $BASE_MODEL \ + --model $BASE_MODEL \ --input-file input_conversations/daring-anteater.jsonl \ --output-dir $HIDDEN_STATES_DIR ``` @@ -216,7 +216,7 @@ To use your own datasets, please preprocess your data into a `.jsonl` file with ```json { - "conversation_id": , + "conversation_id": , "conversations": [{"role":, "content":}] } ``` @@ -256,7 +256,7 @@ This will produce a `d2t.pt` file in `save_dir`, which is the mapping from draft ### Configuring Draft Model -For EAGLE‑1 and EAGLE‑3 we provide a [default model architecture config](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/modelopt/torch/speculative/config.py#L37) in ModelOpt. You can override default settings by providing an additional JSON dict. In this example, we override `draft_vocab_size` in `eagle_config.json`: +For EAGLE‑1 and EAGLE‑3 we provide a [default model architecture config](https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt/torch/speculative/config.py#L37) in ModelOpt. You can override default settings by providing an additional JSON dict. In this example, we override `draft_vocab_size` in `eagle_config.json`: ```json { @@ -336,9 +336,9 @@ More models coming soon! ## Resources -- πŸ“… [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146) -- πŸ“– [Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer) +- πŸ“… [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) +- πŸ“– [Documentation](https://nvidia.github.io/Model-Optimizer) - 🎯 [Benchmarks](../benchmark.md) -- πŸ’‘ [Release Notes](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html) -- πŸ› [File a bug](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=1_bug_report.md) -- ✨ [File a Feature Request](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=2_feature_request.md) +- πŸ’‘ [Release Notes](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html) +- πŸ› [File a bug](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=1_bug_report.md) +- ✨ [File a Feature Request](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=2_feature_request.md) diff --git a/examples/speculative_decoding/example.ipynb b/examples/speculative_decoding/example.ipynb index e9a84a059..d9d9be3a6 100644 --- a/examples/speculative_decoding/example.ipynb +++ b/examples/speculative_decoding/example.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "## Prepare Data\n", - "In this example, we use the Daring-Anteater dataset. For improved accuracy, please refer to the [Data Synthesis Section](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/speculative_decoding#optional-data-synthesis) in the README." + "In this example, we use the Daring-Anteater dataset. For improved accuracy, please refer to the [Data Synthesis Section](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/speculative_decoding#optional-data-synthesis) in the README." ] }, { @@ -158,7 +158,7 @@ "source": [ "## Deploying on TensorRT-LLM\n", "\n", - "Here we show an example to deploy on TRT-LLM with `trtllm-serve` and [TRT-LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release). See [Deployment](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/speculative_decoding#deployment) section for more info. 
\n", + "Here we show an example to deploy on TRT-LLM with `trtllm-serve` and [TRT-LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release). See [Deployment](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/speculative_decoding#deployment) section for more info. \n", "\n", "First, we dump the `trtllm-serve` command and config file we need to `/tmp` folder." ] diff --git a/examples/vllm_serve/Dockerfile b/examples/vllm_serve/Dockerfile index 7fa28c5f1..352896ca2 100644 --- a/examples/vllm_serve/Dockerfile +++ b/examples/vllm_serve/Dockerfile @@ -12,14 +12,14 @@ RUN apt-get update && apt-get install -y \ build-essential \ && rm -rf /var/lib/apt/lists/* -# Copy the entire TensorRT-Model-Optimizer source code -COPY . TensorRT-Model-Optimizer +# Copy the entire Model-Optimizer source code +COPY . Model-Optimizer # Remove .git directory to reduce image size -RUN rm -rf TensorRT-Model-Optimizer/.git +RUN rm -rf Model-Optimizer/.git # Install modelopt from local source with all dependencies -RUN cd TensorRT-Model-Optimizer && \ +RUN cd Model-Optimizer && \ pip install -e ".[all,dev-test]" # Llama4 requires this @@ -29,7 +29,7 @@ RUN pip install flash-attn==2.7.4.post1 RUN python3 -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()" || true # Install requirements from examples (excluding windows examples) -RUN find TensorRT-Model-Optimizer/examples -name "requirements.txt" | grep -v "windows" | while read req_file; do \ +RUN find Model-Optimizer/examples -name "requirements.txt" | grep -v "windows" | while read req_file; do \ echo "Installing from $req_file"; \ pip install -r "$req_file" || echo "Warning: Failed to install from $req_file"; \ done diff --git a/examples/vlm_ptq/README.md b/examples/vlm_ptq/README.md index 05d01bf26..be577cb55 100644 --- a/examples/vlm_ptq/README.md +++ b/examples/vlm_ptq/README.md @@ -1,6 +1,6 @@ # Post-training quantization (PTQ) for Vision Language Models -To learn more about the quantization feature, please refer to the [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html). +To learn more about the quantization feature, please refer to the [documentation](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html). Quantization is an effective model optimization technique that compresses your models. Quantization with Model Optimizer can compress model size by 2x-4x, speeding up inference while preserving model quality. \ Model Optimizer enables highly performant quantization formats including NVFP4, FP8, INT8, INT4 and supports advanced algorithms such as SmoothQuant, AWQ, SVDQuant, and Double Quantization with easy-to-use Python APIs. 
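As a point of reference for those Python APIs, a minimal PTQ sketch is shown below; the model id, the calibration prompts, and the choice of `FP8_DEFAULT_CFG` are placeholders rather than settings prescribed by this example.

```python
# Hedged sketch of Hugging Face model PTQ; model id, calibration data, and
# config choice are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import modelopt.torch.quantization as mtq

model_id = "meta-llama/Llama-3.2-1B-Instruct"  # placeholder model
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

calib_prompts = ["Describe a cat sitting on a red sofa."] * 8  # placeholder calibration set


def forward_loop(m):
    # Run calibration samples through the model so the quantizers can collect statistics.
    for prompt in calib_prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(m.device)
        m(**inputs)


# Other configs (e.g. NVFP4 or INT8 SmoothQuant) follow the same call with a different cfg.
model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)
```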
@@ -12,7 +12,7 @@ This section focuses on Post-training quantization for VLM (Vision Language Mode | **Section** | **Description** | **Link** | **Docs** | | :------------: | :------------: | :------------: | :------------: | | Pre-Requisites | Required & optional packages to use this technique | \[[Link](#pre-requisites)\] | | -| Getting Started | Learn how to optimize your models using PTQ to reduce precision and improve inference efficiency | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | +| Getting Started | Learn how to optimize your models using PTQ to reduce precision and improve inference efficiency | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | | Support Matrix | View the support matrix to see quantization compatibility and feature availability across different models | \[[Link](#support-matrix)\] | | | Framework Scripts | Example scripts demonstrating quantization techniques for optimizing Hugging Face / NeMo / Megatron-LM models | \[[Link](#framework-scripts)\] | | | Pre-Quantized Checkpoints | Ready to deploy Hugging Face pre-quantized checkpoints | \[[Link](#pre-quantized-checkpoints)\] | | @@ -62,15 +62,15 @@ scripts/huggingface_example.sh --model - ## Pre-Quantized Checkpoints -- Ready-to-deploy checkpoints \[[πŸ€— Hugging Face - Nvidia TensorRT Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] +- Ready-to-deploy checkpoints \[[πŸ€— Hugging Face - Nvidia Model Optimizer Collection](https://huggingface.co/collections/nvidia/inference-optimized-checkpoints-with-model-optimizer)\] - Deployable on [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [vLLM](https://github.com/vllm-project/vllm) and [SGLang](https://github.com/sgl-project/sglang) - More models coming soon! ## Resources -- πŸ“… [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146) -- πŸ“– [Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer) +- πŸ“… [Roadmap](https://github.com/NVIDIA/Model-Optimizer/issues/146) +- πŸ“– [Documentation](https://nvidia.github.io/Model-Optimizer) - 🎯 [Benchmarks](../benchmark.md) -- πŸ’‘ [Release Notes](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html) -- πŸ› [File a bug](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=1_bug_report.md) -- ✨ [File a Feature Request](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/new?template=2_feature_request.md) +- πŸ’‘ [Release Notes](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html) +- πŸ› [File a bug](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=1_bug_report.md) +- ✨ [File a Feature Request](https://github.com/NVIDIA/Model-Optimizer/issues/new?template=2_feature_request.md) diff --git a/examples/windows/Benchmark.md b/examples/windows/Benchmark.md index 6b85ac6bb..0105a7fad 100644 --- a/examples/windows/Benchmark.md +++ b/examples/windows/Benchmark.md @@ -1,6 +1,6 @@ -# TensorRT Model Optimizer - Windows: Benchmark Reference +# Model Optimizer - Windows: Benchmark Reference -This document provides a summary of the performance and accuracy measurements of [TensorRT Model Optimizer - Windows](./README.md) for several popular models. The benchmark results in the following tables serve as reference points and **should not be viewed as the maximum performance** achievable by Model Optimizer - Windows. 
+This document provides a summary of the performance and accuracy measurements of [Model Optimizer - Windows](./README.md) for several popular models. The benchmark results in the following tables serve as reference points and **should not be viewed as the maximum performance** achievable by Model Optimizer - Windows. ## 1 Performance And Accuracy Comparison: ONNX INT4 vs ONNX FP16 Models diff --git a/examples/windows/README.md b/examples/windows/README.md index edc0efca9..38b0682a9 100644 --- a/examples/windows/README.md +++ b/examples/windows/README.md @@ -1,10 +1,10 @@
-# NVIDIA TensorRT Model Optimizer - Windows +# NVIDIA Model Optimizer - Windows A Library to Quantize and Compress Deep Learning Models for Optimized Inference on Native Windows RTX GPUs -[![Documentation](https://img.shields.io/badge/Documentation-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-Model-Optimizer/) +[![Documentation](https://img.shields.io/badge/Documentation-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/Model-Optimizer/) [![version](https://img.shields.io/badge/v0.33.0-orange?label=Release)](https://pypi.org/project/nvidia-modelopt/) [![license](https://img.shields.io/badge/License-Apache%202.0-blue)](../../LICENSE) @@ -32,7 +32,7 @@ A Library to Quantize and Compress Deep Learning Models for Optimized Inference ## Overview -The **TensorRT Model Optimizer - Windows** (**ModelOpt-Windows**) is engineered to deliver advanced model compression techniques, including quantization, to Windows RTX PC systems. Specifically tailored to meet the needs of Windows users, ModelOpt-Windows is optimized for rapid and efficient quantization, featuring local GPU calibration, reduced system and video memory consumption, and swift processing times. +The **Model Optimizer - Windows** (**ModelOpt-Windows**) is engineered to deliver advanced model compression techniques, including quantization, to Windows RTX PC systems. Specifically tailored to meet the needs of Windows users, ModelOpt-Windows is optimized for rapid and efficient quantization, featuring local GPU calibration, reduced system and video memory consumption, and swift processing times. The primary objective of the ModelOpt-Windows is to generate optimized, standards-compliant ONNX-format models. This makes it an ideal solution for seamless integration with ONNX Runtime (ORT) and DirectML (DML) frameworks, ensuring broad compatibility with any inference framework supporting the ONNX standard. Furthermore, ModelOpt-Windows integrates smoothly within the Windows ecosystem, with full support for tools and SDKs such as Olive and ONNX Runtime, enabling deployment of quantized models across various independent hardware vendors (IHVs) through the DML path and TensorRT path. Model Optimizer is available for free for all developers on [NVIDIA PyPI](https://pypi.org/project/nvidia-modelopt/). This repository is for sharing examples and GPU-optimized recipes as well as collecting feedback from the community. @@ -59,7 +59,7 @@ pip install onnxruntime-genai-directml>=0.4.0 pip install onnxruntime-directml==1.20.0 ``` -For more details, please refer to the [detailed installation instructions](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/windows/_installation_for_Windows.html). +For more details, please refer to the [detailed installation instructions](https://nvidia.github.io/Model-Optimizer/getting_started/windows/_installation_for_Windows.html). ## Techniques @@ -67,7 +67,7 @@ For more details, please refer to the [detailed installation instructions](https Quantization is an effective model optimization technique for large models. Quantization with ModelOpt-Windows can compress model size by 2x-4x, speeding up inference while preserving model quality. ModelOpt-Window enables highly performant quantization formats including INT4, FP8, INT8, etc. and supports advanced algorithms such as AWQ and SmoothQuant\* focusing on post-training quantization (PTQ) for ONNX and PyTorch\* models with DirectML, CUDA and TensorRT\* inference backends. 
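For orientation, the INT4 (AWQ) path described above is exposed through `modelopt.onnx.quantization`. The sketch below is illustrative only: the keyword arguments, the `awq_lite` method name, the file paths, and the calibration reader are assumptions that should be verified against the `quantize_int4` API referenced in the Getting Started section.

```python
# Illustrative only -- keyword arguments and calibration reader are assumptions;
# see the quantize_int4 API referenced in Getting Started for the real signature.
import onnx

from modelopt.onnx.quantization import quantize_int4

quantized_model = quantize_int4(
    "model_fp16.onnx",
    calibration_method="awq_lite",     # assumed method name
    calibration_data_reader=None,      # supply a calibration data reader for real AWQ scaling
)

onnx.save_model(
    quantized_model,
    "model_int4.onnx",
    save_as_external_data=True,  # large weights may exceed the 2 GB protobuf limit
)
```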
-For more details, please refer to the [detailed quantization guide](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/windows_guides/_ONNX_PTQ_guide.html). +For more details, please refer to the [detailed quantization guide](https://nvidia.github.io/Model-Optimizer/guides/windows_guides/_ONNX_PTQ_guide.html). ## Getting Started @@ -94,13 +94,13 @@ onnx.save_model( Check `modelopt.onnx.quantization.quantize_int4` for details about INT4 quantization API. -Refer to our [Support Matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html#support-matrix) for details about supported features and models. +Refer to our [Support Matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html#support-matrix) for details about supported features and models. -To learn more about ONNX PTQ, refer to our [docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/windows_guides/_ONNX_PTQ_guide.html#onnx-ptq-guide-windows). +To learn more about ONNX PTQ, refer to our [docs](https://nvidia.github.io/Model-Optimizer/guides/windows_guides/_ONNX_PTQ_guide.html#onnx-ptq-guide-windows). ### Deployment -The quantized onnx model can be deployed using frameworks like onnxruntime. Ensure that model’s opset is 19+ for FP8 quantization, and it is 21+ for INT4 quantization. This is needed due to different opset requirements of ONNX’s [Q](https://onnx.ai/onnx/operators/onnx__QuantizeLinear.html)/[DQ](https://onnx.ai/onnx/operators/onnx__DequantizeLinear.html) nodes for INT4, FP8 data-types support. Refer to [Apply Post Training Quantization (PTQ)](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/windows_guides/_ONNX_PTQ_guide.html#apply-onnx-ptq) for details. +The quantized onnx model can be deployed using frameworks like onnxruntime. Ensure that model’s opset is 19+ for FP8 quantization, and it is 21+ for INT4 quantization. This is needed due to different opset requirements of ONNX’s [Q](https://onnx.ai/onnx/operators/onnx__QuantizeLinear.html)/[DQ](https://onnx.ai/onnx/operators/onnx__DequantizeLinear.html) nodes for INT4, FP8 data-types support. Refer to [Apply Post Training Quantization (PTQ)](https://nvidia.github.io/Model-Optimizer/guides/windows_guides/_ONNX_PTQ_guide.html#apply-onnx-ptq) for details. ```python # write steps (say, upgrade_opset() method) to upgrade or patch opset of the model, if needed @@ -117,7 +117,7 @@ onnx.save_model( ) ``` -For detailed instructions about deployment of quantized models with DirectML backend (ORT-DML), see the [DirectML](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/2_directml.html#directml-deployment). +For detailed instructions about deployment of quantized models with DirectML backend (ORT-DML), see the [DirectML](https://nvidia.github.io/Model-Optimizer/deployment/2_directml.html#directml-deployment). > [!Note] > The ready-to-deploy optimized ONNX models from ModelOpt-Windows are available at HuggingFace [NVIDIA collections](https://huggingface.co/collections/nvidia/optimized-onnx-models-for-nvidia-rtx-gpus). @@ -132,7 +132,7 @@ For detailed instructions about deployment of quantized models with DirectML bac ## Support Matrix -Please refer to [support matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html) for a full list of supported features and models. +Please refer to [support matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html) for a full list of supported features and models. 
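As a companion to the deployment notes above, here is a hedged sketch of the opset check and ONNX Runtime DirectML (ORT-DML) session creation they describe; the model path is a placeholder and error handling is omitted.

```python
import onnx
import onnxruntime as ort

model_path = "model_int4.onnx"  # placeholder path to a quantized model

# The deployment notes above require opset 21+ for INT4 (19+ for FP8) Q/DQ support.
model = onnx.load(model_path)
default_opset = next(op.version for op in model.opset_import if op.domain in ("", "ai.onnx"))
assert default_opset >= 21, f"Opset {default_opset} is too low for INT4 Q/DQ; upgrade the opset first."

# Run the model on the DirectML execution provider, falling back to CPU.
session = ort.InferenceSession(
    model_path,
    providers=["DmlExecutionProvider", "CPUExecutionProvider"],
)
print([inp.name for inp in session.get_inputs()])
```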
## Benchmark Results @@ -144,6 +144,6 @@ The ready-to-deploy optimized ONNX models from ModelOpt-Windows are available at ## Release Notes -Please refer to [changelog](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/0_changelog.html) +Please refer to [changelog](https://nvidia.github.io/Model-Optimizer/reference/0_changelog.html) \* *Experimental support* diff --git a/examples/windows/onnx_ptq/genai_llm/README.md b/examples/windows/onnx_ptq/genai_llm/README.md index 8c0f8e94d..b833d44dc 100644 --- a/examples/windows/onnx_ptq/genai_llm/README.md +++ b/examples/windows/onnx_ptq/genai_llm/README.md @@ -97,7 +97,7 @@ Refer to the following example scripts and tutorials for deployment: ### Model Support Matrix -Please refer to [support matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html) for a full list of supported features and models. +Please refer to [support matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html) for a full list of supported features and models. ### Troubleshoot diff --git a/modelopt/__init__.py b/modelopt/__init__.py index d0a778c03..149078279 100644 --- a/modelopt/__init__.py +++ b/modelopt/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Nvidia TensorRT Model Optimizer (modelopt).""" +"""Nvidia Model Optimizer (modelopt).""" from importlib.metadata import version as _version diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 4875fa1ad..596fde9e9 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -226,7 +226,7 @@ def register_hf_attentions_on_the_fly(model): f"Could not create a quantized attention class for {attention_cls} from this model. " "To enable KV Cache quantization, please create a custom quantized attention class for this model and " "register it to ModelOpt using `mtq.register` " - "(see https://nvidia.github.io/TensorRT-Model-Optimizer/guides/_pytorch_quantization.html#custom-quantized-module-and-quantizer-placement)" + "(see https://nvidia.github.io/Model-Optimizer/guides/_pytorch_quantization.html#custom-quantized-module-and-quantizer-placement)" ) diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 33209dbef..ec0b0eb01 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -278,7 +278,7 @@ def save_model(self, *args, **kwargs): if mto.ModeloptStateManager.is_converted(self.accelerator.unwrap_model(self.model)): print_rank_0( "Model saved. To restore, call mto.enable_huggingface_checkpointing() first before loading the " - "model. See https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.opt.plugins.huggingface.html#modelopt.torch.opt.plugins.huggingface.enable_huggingface_checkpointing" + "model. 
See https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.opt.plugins.huggingface.html#modelopt.torch.opt.plugins.huggingface.enable_huggingface_checkpointing"
            )
        self.accelerator.state.fsdp_plugin.set_state_dict_type(original_type)
        if self.args.should_save:
diff --git a/setup.py b/setup.py
index 85b79e729..8a08d579f 100644
--- a/setup.py
+++ b/setup.py
@@ -113,11 +113,11 @@
 setuptools.setup(
     name="nvidia-modelopt",
     version=version,
-    description="Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit.",
-    long_description="Checkout https://github.com/nvidia/TensorRT-Model-Optimizer for more information.",
+    description="Nvidia Model Optimizer: a unified model optimization and deployment toolkit.",
+    long_description="Check out https://github.com/nvidia/Model-Optimizer for more information.",
     long_description_content_type="text/markdown",
     author="NVIDIA Corporation",
-    url="https://github.com/NVIDIA/TensorRT-Model-Optimizer",
+    url="https://github.com/NVIDIA/Model-Optimizer",
     license="Apache 2.0",
     license_files=("LICENSE_HEADER",),
     classifiers=[
diff --git a/tests/examples/README.md b/tests/examples/README.md
index ed9a32f24..07c71f03a 100644
--- a/tests/examples/README.md
+++ b/tests/examples/README.md
@@ -9,11 +9,11 @@ Make sure to use as small models and less data as possible to keep the tests fast.

 ## Running the tests

-To run a test, start from the recommended docker image from our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html).
-Then mount your local modelopt directory to `/workspace/TensorRT-Model-Optimizer` and run this from the root of the repository.
+To run a test, start from the recommended docker image from our [installation docs](https://nvidia.github.io/Model-Optimizer/getting_started/2_installation.html).
+Then mount your local modelopt directory to `/workspace/Model-Optimizer` and run this from the root of the repository.

 ```bash
-cd /workspace/TensorRT-Model-Optimizer
+cd /workspace/Model-Optimizer
 pip install -e ".[all,dev-test]"
 pytest tests/examples/$TEST
 ```
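For completeness, the trainer message patched earlier points users at `mto.enable_huggingface_checkpointing()` before reloading a saved model. Below is a minimal sketch of that restore flow; the checkpoint path is a placeholder.

```python
# Minimal restore sketch; the checkpoint path is a placeholder.
from transformers import AutoModelForCausalLM

import modelopt.torch.opt as mto

# Must be called before from_pretrained() so the ModelOpt state stored with the
# checkpoint is applied while the model loads (see the trainer message above).
mto.enable_huggingface_checkpointing()

model = AutoModelForCausalLM.from_pretrained("/path/to/modelopt-trained-checkpoint")
```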