@@ -0,0 +1,70 @@
# Whisper Inference on CPU

## LEGAL DISCLAIMER
To the extent that any data, datasets, or models are referenced by Intel or accessed using tools or code on this site such data, datasets and models are provided by the third party indicated as the source of such content. Intel does not create the data, datasets, or models, provide a license to any third-party data, datasets, or models referenced, and does not warrant their accuracy or quality. By accessing such data, dataset(s) or model(s) you agree to the terms associated with that content and that your use complies with the applicable license.

Intel expressly disclaims the accuracy, adequacy, or completeness of any data, datasets or models, and is not liable for any errors, omissions, or defects in such content, or for any reliance thereon. Intel also expressly disclaims any warranty of non-infringement with respect to such data, dataset(s), or model(s). Intel is not liable for any liability or damages relating to your use of such data, datasets, or models.

## Set Host Directories
Set the directories on the host system where the model, dataset, and log files will reside. These locations retain the model and data content between Docker sessions.
```
export DATA_DIR="${DATA_DIR:-${PWD}/data}"
export MODEL_DIR="${MODEL_DIR:-${PWD}/model}"
export LOG_DIR="${LOG_DIR:-${PWD}/logs}"
```
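
If these directories do not already exist on the host, it may help to create them beforehand so Docker does not create them as root-owned paths. This `mkdir` step is an optional addition, not part of the original instructions:
```
mkdir -p "${DATA_DIR}" "${MODEL_DIR}" "${LOG_DIR}"
```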

## Launch the Docker Image
In the host OS environment, run the following after setting the proper Docker image name. If the Docker image is not already on the system, it will be retrieved from the registry.

If retrieving the model or dataset, ensure any necessary proxy settings are available inside the container (the `http_proxy`/`https_proxy` variables are passed through to the container below).
```
export DOCKER_IMAGE=intel/intel-optimized-pytorch:mlperf-inference-5.1-whisper

docker run --privileged -it --rm \
    --ipc=host --net=host --cap-add=ALL \
    -e http_proxy=${http_proxy} \
    -e https_proxy=${https_proxy} \
    -v ${DATA_DIR}:/data \
    -v ${MODEL_DIR}:/model \
    -v ${LOG_DIR}:/logs \
    --workdir /workspace \
    ${DOCKER_IMAGE} /bin/bash
```
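
Once inside the container, an optional sanity check (not part of the original steps) is to confirm that the three host directories are mounted where the workload scripts expect them:
```
ls -ld /data /model /logs
```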

## Prepare Workload Resources [one-time operations]
Download the model: Run this step inside the Docker container. This operation will preserve the model on the host system using the volume mapping above.
```
bash scripts/download_model.sh
```
Download the dataset: Run this step inside the Docker container. This operation will preserve the dataset on the host system using the volume mapping above.
```
bash scripts/download_dataset.sh
```
Calibrate the model: Run this step inside the Docker container. This operation will create and preserve a calibrated model along with the original model file.
```
bash scripts/run_calibration.sh
```
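Because these artifacts live on the mounted volumes, they should also be visible from the host once the one-time steps complete. A quick, optional check (exact file names depend on the download and calibration scripts):
```
ls "${MODEL_DIR}" "${DATA_DIR}"
```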

## Run Benchmark
Run this step inside the Docker container. Select the appropriate scenario. If this is the first time running this workload, the model will be calibrated to INT8 and the quantized model stored alongside the original model file (a one-time operation). The default configuration targets Intel EMR. If running on GNR, apply the [additional changes](GNR.md).

Performance:
```
SCENARIO=Offline MODE=Performance bash run_mlperf.sh
```
Accuracy:
```
SCENARIO=Offline MODE=Accuracy bash run_mlperf.sh
```
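
Benchmark output lands on the mounted log volume, i.e. under `${LOG_DIR}` on the host. The compliance step below expects results under `${LOG_DIR}/results`, so an optional check after a run is:
```
ls "${LOG_DIR}/results"
```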

## Run Compliance Tests
Run this step inside the Docker container. After the benchmark scenarios have been run and results exist in `${LOG_DIR}/results`, run the following to perform the compliance runs. Compliance output is written to `${LOG_DIR}/compliance`.
```
SCENARIO=Offline MODE=Compliance bash run_mlperf.sh
```

## Validate Submission Checker
Run this step inside the Docker container. The following script performs accuracy log truncation and runs the submission checker on the contents of `${LOG_DIR}`. The source scripts are distributed as MLPerf Inference reference tools. Ensure the submission content has been populated before running. The script's working output is transient and is removed after the run; the original contents of `${LOG_DIR}` are not modified.
```
VENDOR=Intel bash prepare_submission.sh
```
@@ -0,0 +1,98 @@
# Copyright 2025 The MLPerf Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

import argparse
import array
import json
import sys
import os
from typing import List, Tuple

from whisper.normalizers import EnglishTextNormalizer

from manifest import Manifest
from legacy_helpers import __levenshtein, __gather_predictions
from helpers import get_expanded_wordlist


max_duration = float(os.environ.get("MAX_DURATION", "30.0"))
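# Character vocabulary passed to the Manifest loader and the legacy prediction-gathering helpers.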
labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
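# Maps the --output_dtype choice to the array-module typecode used to decode the hex-encoded results.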
dtype_map = {
    "int8": 'b',
    "int16": 'h',
    "int32": 'l',
    "int64": 'q',
}

def word_error_rate(hypotheses: List[str], references: List[str]) -> Tuple[float, int, int]:
    """
    Computes the average word error rate (WER) between two texts represented as
    corresponding lists of strings. Hypotheses and references must have the same length.

    Args:
        hypotheses: list of hypothesis strings
        references: list of reference strings

    Returns:
        (wer, scores, words): average word error rate, total edit distance,
        and total number of reference words
    """
    normalizer = EnglishTextNormalizer()

    scores = 0
    words = 0
    if len(hypotheses) != len(references):
        raise ValueError("In word error rate calculation, hypotheses and reference"
                         " lists must have the same number of elements. But I got:"
                         " {0} and {1} correspondingly".format(len(hypotheses), len(references)))
    for h, r in zip(hypotheses, references):
        h = normalizer(h)
        r = normalizer(r)
        h_list = h.split()
        r_list = r.split()
        h_list = get_expanded_wordlist(h_list, r_list)
        r_list = get_expanded_wordlist(r_list, h_list)
        words += len(r_list)
        scores += __levenshtein(h_list, r_list)
    wer = scores / words
    return wer, scores, words

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--log_dir", required=True)
    parser.add_argument("--dataset_dir", required=True)
    parser.add_argument("--manifest", required=True)
    parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type")
    args = parser.parse_args()
    return args

def main():
    args = get_args()
    manifest = Manifest(args.dataset_dir, [args.manifest], labels, len(labels), max_duration=max_duration)
    with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh:
        results = json.load(fh)
    hypotheses = []
    references = []
    for result in results:
        # Each accuracy-log entry stores the prediction as a hex-encoded byte string;
        # decode it into integers using the configured output dtype.
        hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist())
        references.append(manifest[result["qsl_idx"]]["transcript"])

    references = __gather_predictions([references], labels=labels)
    hypotheses = __gather_predictions([hypotheses], labels=labels)

    wer, _, _ = word_error_rate(hypotheses=hypotheses, references=references)
    print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100))

if __name__ == '__main__':
    main()
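
# Example invocation (the file names below are placeholders, not defined by this script):
#   point --log_dir at the directory containing mlperf_log_accuracy.json and --manifest at the dataset manifest, e.g.
#   python <this_script>.py --log_dir <log_dir> --dataset_dir <dataset_dir> --manifest <manifest.json>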
@@ -0,0 +1,37 @@
## Intel MLPerf Inference Calibration and Quantization Details

### RetinaNet Quantization
Model Source: https://zenodo.org/record/6617981/files/resnext50_32x4d_fpn.pth

Model Quantization: FP32 -> INT8

Steps: /closed/Intel/code/retinanet/pytorch-cpu/scripts/run_calibration.sh

### DLRMv2 Quantization
Model Source: https://zenodo.org/record/5597155

Model Quantization: FP32 -> INT8

Steps: /closed/Intel/code/dlrm-v2-99.9/pytorch-cpu/scripts/run_calibration.sh

### R-GAT Quantization
Model Source: https://github.com/IllinoisGraphBenchmark/IGB-Datasets/

Model Quantization: FP32 -> INT8

Implementation: /closed/Intel/code/rgat/pytorch-cpu/backend.py

### Whisper Quantization
Model Source: https://huggingface.co/openai/whisper-large-v3

Model Quantization: BF16 -> INT8

Details: /closed/Intel/code/whisper/pytorch-cpu/scripts/run_calibration.sh

### Llama3.1-8B Quantization
Model Source: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct

Model Quantization: BF16 -> INT4

Details: /closed/Intel/code/llama3.1-8b/pytorch-cpu/scripts/run_calibration.sh
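
Most of the recipes above are driven by the referenced `run_calibration.sh` script, executed inside the corresponding workload container (R-GAT quantization is implemented directly in `backend.py`). As an illustration, the Whisper BF16 -> INT8 step reduces to the following sketch, assuming the container setup and `/workspace` working directory from the Whisper README above:
```
bash scripts/run_calibration.sh
```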

@@ -0,0 +1,51 @@
regex # Replace re for higher-performance regex matching
cachetools
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy
requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.55.2
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
openai >= 1.99.1 # For Responses API with reasoning content
pydantic >= 2.11.7
prometheus_client >= 0.18.0
pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.11.3
llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines_core == 0.2.11
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar == 0.1.24; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.13.0
importlib_metadata; python_version < '3.10'
mistral_common[image,audio] >= 1.8.2
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.11.0 # required for compressed-tensors
depyf==0.19.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/others/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
pybase64 # fast base64 implementation
cbor2 # Required for cross-language serialization of hashable objects
setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss