|
22 | 22 | sample_sharegpt_requests, |
23 | 23 | ids_for_prompt, |
24 | 24 | ) |
25 | | - |
| 25 | +import json |
26 | 26 | from aiu_fms_testing_utils.utils.aiu_setup import dprint, aiu_dist_setup |
27 | 27 |
|
28 | 28 | import os |
|
83 | 83 | LLAMA_3p1_70B_INSTRUCT, |
84 | 84 | ], |
85 | 85 | ) |
# Optional model-configuration override: a path to a JSONL file describing the
# models under test, plus a frequency cutoff used to filter its entries.
# Both default to "off" ("" / "0") when the variables are unset.
model_configuration_path = os.getenv("FMS_TEST_SHAPES_FROM_MODEL_CONFIGURATION", "")
model_configuration_frequency = os.getenv("FMS_TEST_SHAPES_FROM_MODEL_CONFIGURATION_FREQUENCY", "0")
# for validation level 1, the default is a failure rate of 1%
# set this environment variable if you would like to relax that threshold
# cast to float: os.environ.get returns a str when the variable is set, while
# the default is a float — normalizing keeps downstream numeric comparisons safe
failure_rate_threshold = float(os.environ.get("FMS_TEST_SHAPES_FAILURE_THRESHOLD", 0.01))
|
# Size the context-length budget from the longest prompt plus the longest
# generation, then bump it to the next 64 boundary (always adds at least one
# extra 64-token block) — presumably an AIU/vLLM alignment requirement; confirm.
_longest_request = max(common_seq_lengths) + max(common_max_new_tokens)
os.environ["VLLM_DT_MAX_CONTEXT_LEN"] = str((_longest_request // 64 + 1) * 64)
os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(common_batch_sizes))
147 | 150 |
|
148 | | -common_shapes = list( |
149 | | - itertools.product( |
150 | | - common_model_paths, |
151 | | - common_batch_sizes, |
152 | | - common_seq_lengths, |
153 | | - common_max_new_tokens, |
154 | | - ) |
155 | | -) |
156 | | - |
157 | 151 | # thresholds are chosen based on 1024 tokens per sequence |
158 | 152 | # 1% error threshold rate between cpu fp32 and cuda fp16 |
159 | 153 | # if a models failure thresholds do not exist in this dict, default to the default_metrics_threshold defined above |
|
180 | 174 | 0.0044301633024588115, |
181 | 175 | ), |
182 | 176 | } |

# When a model-configuration file is supplied, it takes over model selection:
# each JSONL line is expected to carry "model_id", "frequency", "ce" and
# "mean_diff"; entries at or below the requested frequency are enabled and
# their failure thresholds registered.
if model_configuration_path != "":
    print("ignoring FMS_TEST_SHAPES_COMMON_MODEL_PATHS, FMS_TEST_SHAPES_USE_MICRO_MODELS as configuration will be set by FMS_TEST_SHAPES_FROM_MODEL_CONFIGURATION")
    USE_MICRO_MODELS = False
    common_model_paths = []
    frequency = int(model_configuration_frequency)
    # JSON is UTF-8 by spec, so read with an explicit encoding rather than the
    # platform default
    with open(model_configuration_path, "r", encoding="utf-8") as f:
        for line in f:
            # skip blank/whitespace-only lines: json.loads("") raises
            # JSONDecodeError, which previously logged a spurious
            # "improper json line" warning for a trailing newline
            if not line.strip():
                continue
            try:
                model_config = json.loads(line)
                if model_config["frequency"] <= frequency:
                    common_model_paths.append(model_config["model_id"])
                    # assume fullsize models
                    fail_thresholds[(model_config["model_id"], USE_MICRO_MODELS)] = (
                        model_config["ce"],
                        model_config["mean_diff"],
                    )
            except json.JSONDecodeError:
                print(f"config contained an improper json line: {line.strip()}")

# Every (model_path, batch_size, seq_length, max_new_tokens) combination under
# test — the full cartesian product of the configured axes.
_shape_axes = (
    common_model_paths,
    common_batch_sizes,
    common_seq_lengths,
    common_max_new_tokens,
)
common_shapes = list(itertools.product(*_shape_axes))

183 | 203 | # custom weight adaptation to be used in future. For instance if we would like to add some other adaptation, we can register it with this custom adapter |
184 | 204 | # and provide it when converting from an aiu fms model's weights to a cpu fms model's weights. Currently this is only done for gptq, but may be done for other |
185 | 205 | # formats in the future |
|
0 commit comments