Commit b6d2545

Consolidated Llama Guard 4 Text testing scripts and raised NotImplemented error for multimodal inputs
1 parent 8d4c35a

5 files changed (+31, -36 lines)

examples/offline_safety_model_inference.py (5 additions & 4 deletions)
@@ -2,9 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 """
-Example script for running offline safety classification inference on Llama Guard 4.
+Example script for running offline safety classification inference on safety models.

-applies the Llama Guard 4 chat template to 35 prompts from the ailuminate dataset,
+Currently supported models:
+- Llama Guard 4 (meta-llama/Llama-Guard-4-12B)
+
+applies the safety model's chat template to 35 prompts from the ailuminate dataset,
 and runs inference using the JAX backend. It calculates the final accuracy based on
 the model's 'safe'/'unsafe' and S-code classification.

@@ -59,7 +62,6 @@ def create_parser():

     # Add sampling params
     sampling_group = parser.add_argument_group("Sampling parameters")
-    # For Llama Guard, we want deterministic output.
     sampling_group.add_argument("--max-tokens", type=int, default=128)
     sampling_group.add_argument("--temperature", type=float, default=0.0)
     sampling_group.add_argument("--top-p", type=float, default=1.0)
@@ -68,7 +70,6 @@ def create_parser():


 def main(args: dict):
-    # Pop arguments not used by LLM
     max_tokens = args.pop("max_tokens")
     temperature = args.pop("temperature")
     top_p = args.pop("top_p")
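
For context on the 'safe'/'unsafe' and S-code accuracy check the docstring above describes, a minimal sketch of how such scoring might work is shown below. This is not code from the commit; the assumed output format (verdict on the first line, S-codes such as "S1,S10" on a following line) and the helper names parse_guard_output and accuracy are illustrative assumptions.

import re

def parse_guard_output(text: str) -> tuple[bool, frozenset[str]]:
    # Assumed format: first line is "safe" or "unsafe"; S-codes may follow on later lines.
    lines = text.strip().splitlines()
    is_unsafe = bool(lines) and lines[0].strip().lower() == "unsafe"
    codes = frozenset(re.findall(r"S\d+", text)) if is_unsafe else frozenset()
    return is_unsafe, codes

def accuracy(outputs: list[str], labels: list[tuple[bool, frozenset[str]]]) -> float:
    # A prompt counts as correct only if both the verdict and the S-code set match.
    correct = sum(
        parse_guard_output(out) == expected
        for out, expected in zip(outputs, labels)
    )
    return correct / max(len(labels), 1)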

scripts/vllm/integration/test_safety_model_accuracy.py (1 addition & 2 deletions)
@@ -15,7 +15,7 @@
 import pandas as pd
 from tpu_inference.core import disagg_utils

-# --- HAZARD MAPPING (REQUIRED FOR LABEL CONVERSION) ---
+# --- HAZARD MAPPING ---
 # This mapping is based on the MLCommons taxonomy and is needed for the data loader.
 # The ailuminate test set is a standard dataset for safety models
 HAZARD_MAPPING = {
@@ -109,7 +109,6 @@ def safety_test_data_loader(request):
     dataset_path = request.config.getoption("--dataset-path")

     if not dataset_path:
-        # This is expected in CI if the benchmark task definition isn't passing it yet.
         pytest.skip("The --dataset-path argument is required for data loading.")

     # Load the full dataset (Pandas handles GCS URI)
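
The HAZARD_MAPPING contents are not shown in this hunk. As a rough illustration of how a mapping from MLCommons/ailuminate hazard abbreviations to Llama Guard S-codes could feed a data loader, consider the sketch below; the specific abbreviations, S-code pairings, and column names are assumptions, not the contents of this file.

import pandas as pd

# Illustrative entries only; the real mapping lives in test_safety_model_accuracy.py.
EXAMPLE_HAZARD_MAPPING = {
    "vcr": "S1",   # violent crimes (assumed pairing)
    "ncr": "S2",   # non-violent crimes (assumed pairing)
    "hte": "S10",  # hate (assumed pairing)
}

def load_expected_labels(dataset_path: str) -> list[str]:
    # pandas can read a GCS URI (gs://...) directly when gcsfs is installed.
    df = pd.read_csv(dataset_path)
    # "hazard" is an assumed column name for the dataset's hazard abbreviation.
    return [EXAMPLE_HAZARD_MAPPING.get(h, "unknown") for h in df["hazard"]]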

tests/e2e/benchmarking/safety_model_benchmark.sh (5 additions & 19 deletions)
@@ -22,7 +22,6 @@
 set -e

 # --- Configuration & Defaults ---
-# Variables now rely on being set in the environment (e.g., via export or CI YAML)
 MODEL_NAME="${TEST_MODEL}"
 TP_SIZE="${TENSOR_PARALLEL_SIZE}"

@@ -96,7 +95,6 @@ else
 fi

 # Convert to JSONL to be compatible with vllm bench serve command
-# TODO: ensure this conversion works
 if [ ! -f "$LOCAL_JSONL_FILE" ] || [ "$TEST_MODE" == "performance" ]; then
     echo "Converting CSV to JSONL for performance run..."

@@ -131,21 +129,14 @@ fi
 run_accuracy_check() {
     echo -e "\n--- Running Accuracy Check (Mode: ACCURACY) ---"

-    # 1. Define the correct execution directory for conftest.py discovery
-    #CONFTEST_DIR="/workspace/tpu-inference/scripts/vllm/integration"
-    CONFTEST_DIR="/mnt/disks/jiries-disk_data/tpu-inference/scripts/vllm/integration"
+    CONFTEST_DIR="/workspace/tpu-inference/scripts/vllm/integration"

-    # 2. Calculate the relative path from $CONFTEST_DIR to the test file.
-    # We must go up three levels and then down into the test folder.
     RELATIVE_TEST_FILE="test_safety_model_accuracy.py"

-    # 3. Directory Change and Pytest Execution (in a subshell)
     (
-        # Change to the directory containing conftest.py
         cd "$CONFTEST_DIR" || { echo "Error: Failed to find conftest directory: $CONFTEST_DIR"; exit 1; }
         echo "Running pytest from: $(pwd)"

-        # Execute Pytest, running the test file using the relative path
         python -m pytest -s -rP "$RELATIVE_TEST_FILE::test_safety_model_accuracy_check" \
             --tensor-parallel-size="$TP_SIZE" \
             --model-name="$MODEL_NAME" \
@@ -160,7 +151,6 @@ run_accuracy_check() {
 run_performance_benchmark() {
     echo -e "\n--- Running Performance Benchmark (Mode: PERFORMANCE) ---"

-    # 1. Benchmark Execution (against the running server)
     vllm bench serve \
         --model "$MODEL_NAME" \
         --endpoint "/v1/completions" \
@@ -171,7 +161,6 @@
         --custom-output-len "$OUTPUT_LEN_OVERRIDE" \
         2>&1 | tee "$BENCHMARK_LOG_FILE"

-    # 2. Check throughput metric from the log file
     ACTUAL_THROUGHPUT=$(awk '/Output token throughput \(tok\/s\):/ {print $NF}' "$BENCHMARK_LOG_FILE")

     if [ -z "$ACTUAL_THROUGHPUT" ]; then
@@ -181,7 +170,6 @@

     echo "Actual Output Token Throughput: $ACTUAL_THROUGHPUT tok/s"

-    # 3. Perform float comparison
     if awk -v actual="$ACTUAL_THROUGHPUT" -v target="$TARGET_THROUGHPUT" 'BEGIN { exit !(actual >= target) }'; then
         echo "PERFORMANCE CHECK PASSED: $ACTUAL_THROUGHPUT >= $TARGET_THROUGHPUT"
         return 0
@@ -196,29 +184,27 @@ run_performance_benchmark() {
 # Set initial trap to ensure cleanup happens even on immediate exit
 trap 'cleanUp "$MODEL_NAME"' EXIT

-# --- 1. RUN TEST MODE (Offline Accuracy) ---
+# --- 1. RUN TEST MODE ---
 if [ "$TEST_MODE" == "accuracy" ]; then
     run_accuracy_check
     EXIT_CODE=$?
-    # Exit immediately after offline test, as server setup is unnecessary
+
     exit $EXIT_CODE
 fi

 # --- 2. START SERVER (Required ONLY for Performance Mode) ---
 if [ "$TEST_MODE" == "performance" ]; then
     echo "Spinning up the vLLM server for $MODEL_NAME (TP=$TP_SIZE)..."

-    # Server startup (NOTE: No SKIP_JAX_PRECOMPILE=1 here)
+    # Server startup
     (vllm serve "$MODEL_NAME" \
         --tensor-parallel-size "$TP_SIZE" \
         --max-model-len="$MAX_MODEL_LEN" \
         --max-num-batched-tokens="$MAX_BATCHED_TOKENS" \
         2>&1 | tee -a "$LOG_FILE") &

-    # WAIT FOR SERVER (Shared Function Call)
-    waitForServerReady # Exits 1 on timeout
+    waitForServerReady

-    # Execute performance test
     run_performance_benchmark
     EXIT_CODE=$?
 fi

tpu_inference/models/jax/llama_guard_4.py (7 additions & 10 deletions)
@@ -1,15 +1,5 @@
 from tpu_inference.logger import init_logger

-logger = init_logger(__name__)
-
-# --- CRITICAL FIX: Add logger.warning() call here ---
-logger.warning(
-    "🚨🚨🚨WARNING🚨🚨🚨 🚨🚨🚨WARNING🚨🚨🚨 🚨🚨🚨WARNING🚨🚨🚨\n"
-    "Llama Guard 4 (JAX) is WIP: Only the text modality is currently implemented. "
-    "Multimodal inputs will fail.\n"
-    "🚨🚨🚨WARNING🚨🚨🚨 🚨🚨🚨WARNING🚨🚨🚨 🚨🚨🚨WARNING🚨🚨🚨"
-)
-
 import re
 from typing import List, Optional, Tuple, Any

@@ -35,6 +25,13 @@

 logger = init_logger(__name__)

+logger.warning(
+    "🚨🚨🚨WARNING🚨🚨🚨 🚨🚨🚨WARNING🚨🚨🚨 🚨🚨🚨WARNING🚨🚨🚨\n"
+    "Llama Guard 4 (JAX) is WIP: Only the text modality is currently implemented. "
+    "Multimodal inputs will fail.\n"
+    "🚨🚨🚨WARNING🚨🚨🚨 🚨🚨🚨WARNING🚨🚨🚨 🚨🚨🚨WARNING🚨🚨🚨"
+)
+
 class LlamaGuard4ForCausalLM(nnx.Module):

     def __init__(self,

tpu_inference/runner/tpu_runner.py (13 additions & 1 deletion)
@@ -508,7 +508,9 @@ def load_model(self):
         self.is_multimodal_model = (self.model_config.is_multimodal_model
                                     and self.get_multimodal_embeddings_fn
                                     is not None
-                                    and self.model_config.hf_config.architectures[0] != "Llama4ForConditionalGeneration" ) #TODO: Remove Llama Guard 4 specific condition once the LG4 Vision portion is implemented
+                                    and hasattr(self.model_config.hf_config, "architectures") #TODO: Remove Llama Guard 4 specific condition once the LG4 Vision portion is implemented
+                                    and len(self.model_config.hf_config.architectures) >= 1
+                                    and self.model_config.hf_config.architectures[0] != "Llama4ForConditionalGeneration" )

         logger.info(f"Init model | "
                     f"hbm={common_utils.hbm_usage_gb(self.devices)}GiB")
@@ -695,13 +697,23 @@ def _execute_model(
             logits_indices_selector,
         ) = self._prepare_inputs(scheduler_output)

+        is_llama_guard_4 = ( hasattr(self.model_config.hf_config, "architectures") #TODO: Remove Llama Guard 4 specific condition once the LG4 Vision portion is implemented
+                             and len(self.model_config.hf_config.architectures) >= 1
+                             and self.model_config.hf_config.architectures[0] == "Llama4ForConditionalGeneration" )
+
         # multi-modal support
         if self.is_multimodal_model:
             # Run the multimodal encoder if any.
             # We have the modality embeds at this time.
             self.mm_manager.execute_mm_encoder(scheduler_output)
             mm_embeds = self.mm_manager.gather_mm_embeddings(
                 scheduler_output, input_ids.shape[0])
+        #TODO: Remove the follow elif statement once Llama Guard 4 Vision portion has been implemented
+        elif is_llama_guard_4 and any(self.mm_manager.runner.requests[req_id].mm_features for req_id in self.mm_manager.runner.input_batch.req_ids):
+            raise NotImplementedError(
+                "Llama Guard 4 (JAX) currently supports only text inputs. "
+                "Multimodal processing via 'inputs_embeds' is not yet implemented."
+            )
         else:
             mm_embeds = []

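One design note on the new guard in tpu_runner.py: hf_config.architectures can be missing or empty on some configurations, which is why the hasattr and length checks now precede the indexing. A standalone sketch of the same defensive pattern is shown below; the helper name is illustrative, not part of the commit.

def uses_llama4_architecture(hf_config) -> bool:
    # Avoid AttributeError/IndexError when "architectures" is absent or empty.
    architectures = getattr(hf_config, "architectures", None) or []
    return bool(architectures) and architectures[0] == "Llama4ForConditionalGeneration"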
