diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md new file mode 100644 index 00000000000..c0f4c555dbb --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md @@ -0,0 +1,64 @@ +This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8 and evaluate them using a custom vLLM fork. + +## Requirement +```bash +pip install neural-compressor-pt==3.7 +# auto-round +pip install auto-round==0.9.1 +# vLLM +git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork +VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv +# other requirements +pip install -r requirements.txt +``` + +### Quantize Model +- Export model path +```bash +export MODEL=deepseek-ai/DeepSeek-R1 +``` + +- MXFP8 +```bash +bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels +``` + +- MXFP4 +```bash +bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels +``` + +## Evaluation + +### Prompt Tests + +Usage: +```bash +bash ./run_generate.sh -s [mxfp4|mxfp8] -tp [tensor_parallel_size] -m [model_path] +``` + +- MXFP8 +```bash +bash ./run_generate.sh -s mxfp8 -tp 8 -m /path/to/ds_mxfp8 +``` +- MXFP4 +```bash +bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4 +``` +### Evaluation + + +Usage: +```bash +bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size] +``` +```bash +bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8 +bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8 + +``` +- MXFP4 +```bash +bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4 +bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4 +``` \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/generate.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/generate.py new file mode 100644 index 00000000000..ec150a71576 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/generate.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copied from https://github.com/vllm-project/vllm/ + +from vllm import LLM, EngineArgs +from vllm.utils.argparse_utils import FlexibleArgumentParser + + + +def create_parser(): + parser = FlexibleArgumentParser() + # Add engine args + EngineArgs.add_cli_args(parser) + parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") + # Add sampling params + sampling_group = parser.add_argument_group("Sampling parameters") + sampling_group.add_argument("--max-tokens", type=int) + sampling_group.add_argument("--temperature", type=float) + sampling_group.add_argument("--top-p", type=float) + sampling_group.add_argument("--top-k", type=int) + + return parser + + +def main(args: dict): + # Pop arguments not used by LLM + max_tokens = args.pop("max_tokens") + temperature = args.pop("temperature") + top_p = args.pop("top_p") + top_k = args.pop("top_k") + + # Create an LLM + llm = LLM(**args) + + # Create a sampling params object + sampling_params = llm.get_default_sampling_params() + if max_tokens 
is not None: + sampling_params.max_tokens = max_tokens + if temperature is not None: + sampling_params.temperature = temperature + if top_p is not None: + sampling_params.top_p = top_p + if top_k is not None: + sampling_params.top_k = top_k + + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + +if __name__ == "__main__": + parser = create_parser() + args: dict = vars(parser.parse_args()) + main(args) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py new file mode 100644 index 00000000000..2192948ce73 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py @@ -0,0 +1,113 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +import transformers +import logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +topologies_config = { + "mxfp8": { + "scheme": "MXFP8", + "fp_layers": "lm_head", + "iters": 0, + }, + "mxfp4": { + "scheme": "MXFP4", + "fp_layers": "lm_head,self_attn", + "iters": 0, + }, +} + + +def get_model_and_tokenizer(model_name): + # Load model and tokenizer + fp32_model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="cpu", + trust_remote_code=True, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_name, + trust_remote_code=True, + ) + return fp32_model, tokenizer + + +def quant_model(args): + from neural_compressor.torch.quantization import ( + AutoRoundConfig, + convert, + prepare, + ) + + config = topologies_config[args.t] + export_format = "auto_round" if args.use_autoround_format else "llm_compressor" + output_dir = f"{args.output_dir}/quantized_model_{args.t}" + fp32_model, tokenizer = get_model_and_tokenizer(args.model) + quant_config = AutoRoundConfig( + tokenizer=tokenizer, + scheme=config["scheme"], + enable_torch_compile=args.enable_torch_compile, + iters=config["iters"], + fp_layers=config["fp_layers"], + export_format=export_format, + output_dir=output_dir, + ) + + # quantizer execute + model = prepare(model=fp32_model, quant_config=quant_config) + inc_model = convert(model) + logger.info(f"Quantized model saved to {output_dir}") + + +if __name__ == "__main__": + import argparse + + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Select a quantization scheme.") + parser.add_argument( + "--model", + type=str, + help="Path to the pre-trained model or model identifier from Hugging Face Hub.", + ) + parser.add_argument( + "-t", + type=str, + choices=topologies_config.keys(), + default="mxfp4", + help="Quantization scheme to use. 
Available options: " + ", ".join(topologies_config.keys()), + ) + + parser.add_argument( + "--enable_torch_compile", + action="store_true", + help="Enable torch compile for the model.", + ) + parser.add_argument( + "--use_autoround_format", + action="store_true", + help="Use AutoRound format for saving the quantized model.", + ) + + parser.add_argument( + "--skip_attn", + action="store_true", + help="Skip quantize attention layers.", + ) + parser.add_argument( + "--iters", + type=int, + default=0, + help="Number of iterations for quantization.", + ) + parser.add_argument( + "--output_dir", + type=str, + default="./", + help="Directory to save the quantized model.", + ) + + args = parser.parse_args() + + quant_model(args) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt new file mode 100644 index 00000000000..80392549f26 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt @@ -0,0 +1,2 @@ +lm-eval==0.4.9.1 +loguru \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh new file mode 100644 index 00000000000..1d805c7872b --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -0,0 +1,120 @@ +#!/bin/bash +set -e + +# Usage: ./run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size] +# Default values +MODEL_PATH="" +SCHEME="mxfp8" +TASK_NAME="piqa,hellaswag,mmlu" +TP_SIZE=8 +BATCH_SIZE=512 + +# Function to display usage +usage() { + echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]" + echo " -m: Path to the quantized model (required)" + echo " -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)" + echo " -t: Task name(s) to evaluate (default: piqa,hellaswag,mmlu)" + echo " -tp: Tensor parallelism size (default: 8)" + echo " -b: Batch size (default: 512)" + echo "" + echo "Examples:" + echo " $0 -m /path/to/model -s mxfp4 -t gsm8k -tp 4 -b 256" + echo " $0 -m /path/to/model -s mxfp8 -t piqa,hellaswag -tp 8 -b 512" +} + +# Parse command-line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -m) + MODEL_PATH="$2" + shift 2 + ;; + -s) + SCHEME="$2" + shift 2 + ;; + -t) + TASK_NAME="$2" + shift 2 + ;; + -tp) + TP_SIZE="$2" + shift 2 + ;; + -b) + BATCH_SIZE="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Invalid option: $1" >&2 + usage + exit 1 + ;; + esac +done + +# Validate required arguments +if [[ -z "$MODEL_PATH" ]]; then + echo "Error: Model path (-m) is required." 
+ usage + exit 1 +fi + +# Extract model name and set output directory +MODEL_NAME=$(basename ${MODEL_PATH}) +OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval" + +# Create output directory +mkdir -p ${OUTPUT_DIR} + +# Set environment variables based on the quantization scheme +if [[ "$SCHEME" == "mxfp4" ]]; then + VLLM_AR_MXFP4_MODULAR_MOE=1 + VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 + VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 + VLLM_ENABLE_STATIC_MOE=0 + VLLM_USE_DEEP_GEMM=0 + VLLM_ENABLE_AR_EXT=1 +elif [[ "$SCHEME" == "mxfp8" ]]; then + VLLM_AR_MXFP4_MODULAR_MOE=0 + VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 + VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 + VLLM_ENABLE_STATIC_MOE=0 + VLLM_USE_DEEP_GEMM=0 + VLLM_ENABLE_AR_EXT=1 +else + echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4' or 'mxfp8'." + usage + exit 1 +fi + +# Run evaluation +echo "Evaluating model: ${MODEL_PATH}" +echo "Quantization scheme: ${SCHEME}" +echo "Tasks: ${TASK_NAME}" +echo "Tensor parallelism size: ${TP_SIZE}" +echo "Batch size: ${BATCH_SIZE}" +echo "Output directory: ${OUTPUT_DIR}" + +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT \ +VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE \ +VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8 \ +VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS \ +VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE \ +VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM \ +VLLM_ENABLE_V1_MULTIPROCESSING=1 \ +lm_eval --model vllm \ + --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False" \ + --tasks $TASK_NAME \ + --batch_size $BATCH_SIZE \ + --log_samples \ + --limit 64 \ + --seed 42 \ + --output_path ${OUTPUT_DIR} \ + --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh new file mode 100644 index 00000000000..c9ee73ce182 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh @@ -0,0 +1,118 @@ +#!/bin/bash +set -e + +# Model Testing Script +# Usage: ./run_generate.sh -s [mxfp4|mxfp8] -m [model_path] -tp [tensor_parallel_size] + +# Default values +QUANT_TYPE="mxfp8" +MODEL_PATH="/path/to/quantized_model" +TP_SIZE=8 + +# Function to display usage +usage() { + echo "Usage: $0 -s [mxfp4|mxfp8] -m [model_path] -tp [tensor_parallel_size]" + echo " -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)" + echo " -m: Path to quantized model (required)" + echo " -tp: Tensor parallelism size (default: 8)" + echo "" + echo "Examples:" + echo " $0 -s mxfp4 -m /path/to/my/model -tp 4" + echo " $0 -m /path/to/my/model" + echo " $0 -s mxfp8 -m /path/to/my/model" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -s) + QUANT_TYPE="$2" + shift 2 + ;; + -m) + MODEL_PATH="$2" + shift 2 + ;; + -tp) + TP_SIZE="$2" + shift 2 + ;; + -h) + usage + exit 0 + ;; + *) + echo "Invalid option: $1" >&2 + usage + exit 1 + ;; + esac +done + + +# Validate quantization type +QUANT_TYPE_UPPER=$(echo "$QUANT_TYPE" | tr '[:lower:]' '[:upper:]') +if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" ]]; then + echo "Error: Quantization type must be mxfp4 or mxfp8" + usage + exit 1 
+fi + +# Validate model path +if [[ "$MODEL_PATH" == "/path/to/quantized_model" ]]; then + echo "Error: Model path is required (-m)" + usage + exit 1 +fi + +if [[ ! -d "$MODEL_PATH" ]]; then + echo "Error: Model path '$MODEL_PATH' does not exist or is not a directory" + exit 1 +fi + +# Validate TP_SIZE is a number +if ! [[ "$TP_SIZE" =~ ^[0-9]+$ ]] || [ "$TP_SIZE" -lt 1 ]; then + echo "Error: Tensor parallelism size must be a positive integer" + exit 1 +fi + +echo "Running $QUANT_TYPE_UPPER test with:" +echo " Model: $MODEL_PATH" +echo " Tensor Parallelism: $TP_SIZE" +echo "" + +# Set environment variables based on quantization type +if [[ "$QUANT_TYPE_UPPER" == "MXFP4" ]]; then + export VLLM_AR_MXFP4_MODULAR_MOE=1 + export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 + echo "Using MXFP4 configuration" +else + export VLLM_AR_MXFP4_MODULAR_MOE=0 + export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 + echo "Using MXFP8 configuration" +fi + +# Common environment variables +export VLLM_ENABLE_AR_EXT=1 +export VLLM_ENABLE_STATIC_MOE=0 +export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 +export VLLM_USE_DEEP_GEMM=0 +export VLLM_ENABLE_V1_MULTIPROCESSING=0 + +echo "Environment variables set:" +echo " VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE" +echo " VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8" +echo " VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT" +echo "" + +# Run the model +echo "Starting model generation..." +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +python generate.py \ + --model "${MODEL_PATH}" \ + --tensor_parallel_size $TP_SIZE \ + --max-tokens 16 \ + --max-num-seqs 4 \ + --gpu_memory_utilization 0.75 \ + --no-enable-prefix-caching \ + --enable_expert_parallel \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh new file mode 100644 index 00000000000..435132a97f2 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -e + +MODEL="" +TARGET="" +OUTPUT_DIR="" + +usage() { + echo "Usage: $0 --model MODEL -t [mxfp4|mxfp8] --output_dir DIR" + echo " --model Hugging Face model ID or local path" + echo " -t quantization target (e.g. mxfp8, mxfp4)" + echo " --output_dir output directory for quantized model" + exit 1 +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --model) + MODEL="$2" + shift 2 + ;; + -t) + TARGET="$2" + shift 2 + ;; + --output_dir) + OUTPUT_DIR="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + +[ -z "$MODEL" ] && echo "Error: --model is required" && usage +[ -z "$TARGET" ] && echo "Error: -t is required" && usage +[ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage + +python quantize.py \ + --model "$MODEL" \ + -t "$TARGET" \ + --use_autoround_format \ + --output_dir "$OUTPUT_DIR" \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md new file mode 100644 index 00000000000..8f8b17af102 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md @@ -0,0 +1,67 @@ +This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MXFP8 and evaluate them using a custom vLLM fork. 
+ +## Requirement +```bash +pip install neural-compressor-pt==3.7 +# auto-round +pip install auto-round==0.9.1 +# vLLM +git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork +VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv +# other requirements +pip install -r requirements.txt +``` + +### Quantize Model +- Export model path +```bash +export MODEL=Qwen/Qwen3-235B-A22B +``` +> [!TIP] +> For quicker experimentation (shorter quantization and evaluation time, lower memory), +> you can start with the smaller `export MODEL=Qwen/Qwen3-30B-A3B` model before moving to larger variants. + +- MXFP8 +```bash +bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels +``` + +- MXFP4 +```bash +bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels +``` + +## Evaluation + +### Prompt Tests + +Usage: +```bash +bash ./run_generate.sh -s [mxfp4|mxfp8] -tp [tensor_parallel_size] -m [model_path] +``` + +- MXFP8 +```bash +bash ./run_generate.sh -s mxfp8 -tp 4 -m /path/to/qwen_mxfp8 +``` +- MXFP4 +```bash +bash ./run_generate.sh -s mxfp4 -tp 4 -m /path/to/qwen_mxfp4 +``` +### Evaluation + + +Usage: +```bash +bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size] +``` +```bash +bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp8 +bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8 + +``` +- MXFP4 +```bash +bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp4 +bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4 +``` \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/generate.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/generate.py new file mode 100644 index 00000000000..ec150a71576 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/generate.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copied from https://github.com/vllm-project/vllm/ + +from vllm import LLM, EngineArgs +from vllm.utils.argparse_utils import FlexibleArgumentParser + + + +def create_parser(): + parser = FlexibleArgumentParser() + # Add engine args + EngineArgs.add_cli_args(parser) + parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") + # Add sampling params + sampling_group = parser.add_argument_group("Sampling parameters") + sampling_group.add_argument("--max-tokens", type=int) + sampling_group.add_argument("--temperature", type=float) + sampling_group.add_argument("--top-p", type=float) + sampling_group.add_argument("--top-k", type=int) + + return parser + + +def main(args: dict): + # Pop arguments not used by LLM + max_tokens = args.pop("max_tokens") + temperature = args.pop("temperature") + top_p = args.pop("top_p") + top_k = args.pop("top_k") + + # Create an LLM + llm = LLM(**args) + + # Create a sampling params object + sampling_params = llm.get_default_sampling_params() + if max_tokens is not None: + sampling_params.max_tokens = max_tokens + if temperature is not None: + sampling_params.temperature = temperature + if top_p is not None: + sampling_params.top_p = top_p + if top_k is not None: + sampling_params.top_k = top_k + + # Generate texts from the prompts. 
The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + +if __name__ == "__main__": + parser = create_parser() + args: dict = vars(parser.parse_args()) + main(args) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py new file mode 100644 index 00000000000..0b5bf23ab6e --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py @@ -0,0 +1,113 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +import transformers +import logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +topologies_config = { + "mxfp8": { + "scheme": "MXFP8", + "fp_layers": "lm_head,mlp.gate", + "iters": 0, + }, + "mxfp4": { + "scheme": "MXFP4", + "fp_layers": "lm_head,mlp.gate,self_attn", + "iters": 200, + }, +} + + +def get_model_and_tokenizer(model_name): + # Load model and tokenizer + fp32_model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="cpu", + trust_remote_code=True, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_name, + trust_remote_code=True, + ) + return fp32_model, tokenizer + + +def quant_model(args): + from neural_compressor.torch.quantization import ( + AutoRoundConfig, + convert, + prepare, + ) + + config = topologies_config[args.t] + export_format = "auto_round" if args.use_autoround_format else "llm_compressor" + output_dir = f"{args.output_dir}/quantized_model_{args.t}" + fp32_model, tokenizer = get_model_and_tokenizer(args.model) + quant_config = AutoRoundConfig( + tokenizer=tokenizer, + scheme=config["scheme"], + enable_torch_compile=args.enable_torch_compile, + iters=config["iters"], + fp_layers=config["fp_layers"], + export_format=export_format, + output_dir=output_dir, + ) + + # quantizer execute + model = prepare(model=fp32_model, quant_config=quant_config) + inc_model = convert(model) + logger.info(f"Quantized model saved to {output_dir}") + + +if __name__ == "__main__": + import argparse + + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Select a quantization scheme.") + parser.add_argument( + "--model", + type=str, + help="Path to the pre-trained model or model identifier from Hugging Face Hub.", + ) + parser.add_argument( + "-t", + type=str, + choices=topologies_config.keys(), + default="mxfp4", + help="Quantization scheme to use. 
Available options: " + ", ".join(topologies_config.keys()), + ) + + parser.add_argument( + "--enable_torch_compile", + action="store_true", + help="Enable torch compile for the model.", + ) + parser.add_argument( + "--use_autoround_format", + action="store_true", + help="Use AutoRound format for saving the quantized model.", + ) + + parser.add_argument( + "--skip_attn", + action="store_true", + help="Skip quantize attention layers.", + ) + parser.add_argument( + "--iters", + type=int, + default=0, + help="Number of iterations for quantization.", + ) + parser.add_argument( + "--output_dir", + type=str, + default="./", + help="Directory to save the quantized model.", + ) + + args = parser.parse_args() + + quant_model(args) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/requirements.txt new file mode 100644 index 00000000000..80392549f26 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/requirements.txt @@ -0,0 +1,2 @@ +lm-eval==0.4.9.1 +loguru \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh new file mode 100644 index 00000000000..1d805c7872b --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh @@ -0,0 +1,120 @@ +#!/bin/bash +set -e + +# Usage: ./run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size] +# Default values +MODEL_PATH="" +SCHEME="mxfp8" +TASK_NAME="piqa,hellaswag,mmlu" +TP_SIZE=8 +BATCH_SIZE=512 + +# Function to display usage +usage() { + echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]" + echo " -m: Path to the quantized model (required)" + echo " -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)" + echo " -t: Task name(s) to evaluate (default: piqa,hellaswag,mmlu)" + echo " -tp: Tensor parallelism size (default: 8)" + echo " -b: Batch size (default: 512)" + echo "" + echo "Examples:" + echo " $0 -m /path/to/model -s mxfp4 -t gsm8k -tp 4 -b 256" + echo " $0 -m /path/to/model -s mxfp8 -t piqa,hellaswag -tp 8 -b 512" +} + +# Parse command-line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -m) + MODEL_PATH="$2" + shift 2 + ;; + -s) + SCHEME="$2" + shift 2 + ;; + -t) + TASK_NAME="$2" + shift 2 + ;; + -tp) + TP_SIZE="$2" + shift 2 + ;; + -b) + BATCH_SIZE="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Invalid option: $1" >&2 + usage + exit 1 + ;; + esac +done + +# Validate required arguments +if [[ -z "$MODEL_PATH" ]]; then + echo "Error: Model path (-m) is required." 
+ usage + exit 1 +fi + +# Extract model name and set output directory +MODEL_NAME=$(basename ${MODEL_PATH}) +OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval" + +# Create output directory +mkdir -p ${OUTPUT_DIR} + +# Set environment variables based on the quantization scheme +if [[ "$SCHEME" == "mxfp4" ]]; then + VLLM_AR_MXFP4_MODULAR_MOE=1 + VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 + VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 + VLLM_ENABLE_STATIC_MOE=0 + VLLM_USE_DEEP_GEMM=0 + VLLM_ENABLE_AR_EXT=1 +elif [[ "$SCHEME" == "mxfp8" ]]; then + VLLM_AR_MXFP4_MODULAR_MOE=0 + VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 + VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 + VLLM_ENABLE_STATIC_MOE=0 + VLLM_USE_DEEP_GEMM=0 + VLLM_ENABLE_AR_EXT=1 +else + echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4' or 'mxfp8'." + usage + exit 1 +fi + +# Run evaluation +echo "Evaluating model: ${MODEL_PATH}" +echo "Quantization scheme: ${SCHEME}" +echo "Tasks: ${TASK_NAME}" +echo "Tensor parallelism size: ${TP_SIZE}" +echo "Batch size: ${BATCH_SIZE}" +echo "Output directory: ${OUTPUT_DIR}" + +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT \ +VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE \ +VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8 \ +VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS \ +VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE \ +VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM \ +VLLM_ENABLE_V1_MULTIPROCESSING=1 \ +lm_eval --model vllm \ + --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False" \ + --tasks $TASK_NAME \ + --batch_size $BATCH_SIZE \ + --log_samples \ + --limit 64 \ + --seed 42 \ + --output_path ${OUTPUT_DIR} \ + --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_generate.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_generate.sh new file mode 100644 index 00000000000..c9ee73ce182 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_generate.sh @@ -0,0 +1,118 @@ +#!/bin/bash +set -e + +# Model Testing Script +# Usage: ./run_generate.sh -s [mxfp4|mxfp8] -m [model_path] -tp [tensor_parallel_size] + +# Default values +QUANT_TYPE="mxfp8" +MODEL_PATH="/path/to/quantized_model" +TP_SIZE=8 + +# Function to display usage +usage() { + echo "Usage: $0 -s [mxfp4|mxfp8] -m [model_path] -tp [tensor_parallel_size]" + echo " -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)" + echo " -m: Path to quantized model (required)" + echo " -tp: Tensor parallelism size (default: 8)" + echo "" + echo "Examples:" + echo " $0 -s mxfp4 -m /path/to/my/model -tp 4" + echo " $0 -m /path/to/my/model" + echo " $0 -s mxfp8 -m /path/to/my/model" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -s) + QUANT_TYPE="$2" + shift 2 + ;; + -m) + MODEL_PATH="$2" + shift 2 + ;; + -tp) + TP_SIZE="$2" + shift 2 + ;; + -h) + usage + exit 0 + ;; + *) + echo "Invalid option: $1" >&2 + usage + exit 1 + ;; + esac +done + + +# Validate quantization type +QUANT_TYPE_UPPER=$(echo "$QUANT_TYPE" | tr '[:lower:]' '[:upper:]') +if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" ]]; then + echo "Error: Quantization type must be mxfp4 or mxfp8" + usage + exit 1 +fi + +# 
Validate model path +if [[ "$MODEL_PATH" == "/path/to/quantized_model" ]]; then + echo "Error: Model path is required (-m)" + usage + exit 1 +fi + +if [[ ! -d "$MODEL_PATH" ]]; then + echo "Error: Model path '$MODEL_PATH' does not exist or is not a directory" + exit 1 +fi + +# Validate TP_SIZE is a number +if ! [[ "$TP_SIZE" =~ ^[0-9]+$ ]] || [ "$TP_SIZE" -lt 1 ]; then + echo "Error: Tensor parallelism size must be a positive integer" + exit 1 +fi + +echo "Running $QUANT_TYPE_UPPER test with:" +echo " Model: $MODEL_PATH" +echo " Tensor Parallelism: $TP_SIZE" +echo "" + +# Set environment variables based on quantization type +if [[ "$QUANT_TYPE_UPPER" == "MXFP4" ]]; then + export VLLM_AR_MXFP4_MODULAR_MOE=1 + export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 + echo "Using MXFP4 configuration" +else + export VLLM_AR_MXFP4_MODULAR_MOE=0 + export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 + echo "Using MXFP8 configuration" +fi + +# Common environment variables +export VLLM_ENABLE_AR_EXT=1 +export VLLM_ENABLE_STATIC_MOE=0 +export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0 +export VLLM_USE_DEEP_GEMM=0 +export VLLM_ENABLE_V1_MULTIPROCESSING=0 + +echo "Environment variables set:" +echo " VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE" +echo " VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8" +echo " VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT" +echo "" + +# Run the model +echo "Starting model generation..." +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +python generate.py \ + --model "${MODEL_PATH}" \ + --tensor_parallel_size $TP_SIZE \ + --max-tokens 16 \ + --max-num-seqs 4 \ + --gpu_memory_utilization 0.75 \ + --no-enable-prefix-caching \ + --enable_expert_parallel \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh new file mode 100644 index 00000000000..435132a97f2 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -e + +MODEL="" +TARGET="" +OUTPUT_DIR="" + +usage() { + echo "Usage: $0 --model MODEL -t [mxfp4|mxfp8] --output_dir DIR" + echo " --model Hugging Face model ID or local path" + echo " -t quantization target (e.g. mxfp8, mxfp4)" + echo " --output_dir output directory for quantized model" + exit 1 +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --model) + MODEL="$2" + shift 2 + ;; + -t) + TARGET="$2" + shift 2 + ;; + --output_dir) + OUTPUT_DIR="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + +[ -z "$MODEL" ] && echo "Error: --model is required" && usage +[ -z "$TARGET" ] && echo "Error: -t is required" && usage +[ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage + +python quantize.py \ + --model "$MODEL" \ + -t "$TARGET" \ + --use_autoround_format \ + --output_dir "$OUTPUT_DIR" \ No newline at end of file
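
As a quick sanity check of a quantized checkpoint without going through the shell wrappers above, the short offline-inference sketch below uses the same vLLM API already exercised by `generate.py`. It is a minimal sketch rather than part of the example scripts: the model path is a placeholder for a directory produced by `run_quant.sh` (quantize.py saves to `<output_dir>/quantized_model_<scheme>`), the environment variables mirror the MXFP8 branch of `run_generate.sh`, and the `SamplingParams` values are illustrative only.

```python
import os

# Mirror the MXFP8 environment from run_generate.sh before vLLM is imported.
# (For MXFP4, run_generate.sh instead sets VLLM_AR_MXFP4_MODULAR_MOE=1 and
# VLLM_MXFP4_PRE_UNPACK_TO_FP8=1.)
os.environ.setdefault("VLLM_ENABLE_AR_EXT", "1")
os.environ.setdefault("VLLM_AR_MXFP4_MODULAR_MOE", "0")
os.environ.setdefault("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "0")
os.environ.setdefault("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0")
os.environ.setdefault("VLLM_ENABLE_STATIC_MOE", "0")
os.environ.setdefault("VLLM_USE_DEEP_GEMM", "0")

from vllm import LLM, SamplingParams

# Placeholder path: quantize.py writes the model to <output_dir>/quantized_model_<scheme>.
llm = LLM(
    model="./qmodels/quantized_model_mxfp8",
    tensor_parallel_size=8,
    gpu_memory_utilization=0.75,
)

# Short, greedy generations are enough to confirm the checkpoint loads and decodes.
outputs = llm.generate(
    ["Hello, my name is", "The capital of France is"],
    SamplingParams(max_tokens=16, temperature=0.0),
)
for out in outputs:
    print(f"Prompt: {out.prompt!r} -> {out.outputs[0].text!r}")
```

The same pattern works for an MXFP4 checkpoint by pointing `model` at the MXFP4 output directory and flipping the two MXFP4-specific environment variables as noted in the comment.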