@@ -42,6 +42,7 @@
 GRANITE_3p3_8B_INSTRUCT = "ibm-granite/granite-3.3-8b-instruct"
 GRANITE_20B_CODE_INSTRUCT_8K = "ibm-granite/granite-20b-code-instruct-8k"
 LLAMA_3p1_70B_INSTRUCT = "meta-llama/Llama-3.1-70B-Instruct"
+MISTRAL_0p3_7B_INSTRUCT = "mistralai/Mistral-7B-Instruct-v0.3"
 
 micro_model_mapping = {
     LLAMA_3p1_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "llama-3.1-8b-layers-3-step-24000"),
@@ -72,6 +73,7 @@
         GRANITE_3p3_8B_INSTRUCT,
         GRANITE_20B_CODE_INSTRUCT_8K,
         LLAMA_3p1_70B_INSTRUCT,
+        MISTRAL_0p3_7B_INSTRUCT
     ],
 )
 # for validation level 1, the default is a failure rate of 1%
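As a reading aid for the 1% default mentioned in the comment above: a minimal sketch of how a level-1 validation gate can apply a token-level failure rate. The function name and signature are illustrative assumptions, not this repository's actual API.

# Illustrative sketch only: the name and signature below are assumed,
# not taken from this repository. Level-1 validation tolerates a small
# fraction of mismatching tokens; by default up to 1% may fail.
def passes_level_1_validation(
    num_failed_tokens: int,
    num_total_tokens: int,
    failure_rate: float = 0.01,  # the 1% default noted above
) -> bool:
    return (num_failed_tokens / num_total_tokens) <= failure_rate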
@@ -145,25 +147,34 @@
 # threshold key is (model_id, is_tiny_model)
 fail_thresholds = {
     (LLAMA_3p1_8B_INSTRUCT, False): (
-        2.6994638133048965,
-        0.00047589250549208347,
+        2.7080255031585696,
+        0.0004068055667448795,
     ),
     (GRANITE_3p2_8B_INSTRUCT, False): (
         2.3919514417648315,
         0.0005767398688476533,
     ),
+    (GRANITE_3p2_8B_INSTRUCT, True): (
+        2.7449850964546205,
+        0.00018840670207282534,
+    ),
     (GRANITE_3p3_8B_INSTRUCT, False): (
         2.4444521379470827,
         0.0004970188625156878,
     ),
     (GRANITE_20B_CODE_INSTRUCT_8K, False): (
-        2.640706129074097,
-        0.00034344267623964697,
+        2.646075320243838,
+        0.0003458251833217223,
     ),
+    # TODO: run llama 70B with 1,2,4,8 batches
     (LLAMA_3p1_70B_INSTRUCT, False): (
         2.841279556751251,
         0.0044301633024588115,
     ),
+    (MISTRAL_0p3_7B_INSTRUCT, False): (
+        2.846206340789795,
+        0.0008768103783950205,
+    ),
 }
 # custom weight adaptation to be used in future. For instance if we would like to add some other adaptation, we can register it with this custom adapter
 # and provide it when converting from an aiu fms model's weights to a cpu fms model's weights. Currently this is only done for gptq, but may be done for other
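Each fail_thresholds entry above pairs a (model_id, is_tiny_model) key with two bounds. A minimal sketch of how such a table might be consulted follows; reading the first value as a cross-entropy bound and the second as a mean-difference bound is an assumption inferred from their magnitudes, as are all names in the sketch.

# Illustrative sketch only: the (cross-entropy, mean-diff) reading and
# every name here are assumptions, not this repository's actual code.
# fail_thresholds is the dict defined in the hunk above.
def within_thresholds(
    model_id: str,
    is_tiny_model: bool,
    observed_ce: float,
    observed_mean_diff: float,
) -> bool:
    ce_bound, mean_diff_bound = fail_thresholds[(model_id, is_tiny_model)]
    return observed_ce <= ce_bound and observed_mean_diff <= mean_diff_bound

Under this reading, the new (GRANITE_3p2_8B_INSTRUCT, True) entry simply gives the tiny (micro) variant of that model its own pair of bounds rather than reusing the full model's.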
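The closing comment describes a registration pattern: a named weight adaptation (currently only gptq) is registered once and then supplied when converting an AIU FMS model's weights to a CPU FMS model's weights. A generic sketch of that pattern, with entirely hypothetical names:

# Generic registry-pattern sketch: every name below is hypothetical and
# not taken from this codebase; only the pattern mirrors the comment above.
WEIGHT_ADAPTERS = {}

def register_weight_adapter(name, adapt_fn):
    # e.g. register_weight_adapter("gptq", gptq_adapt), per the comment
    WEIGHT_ADAPTERS[name] = adapt_fn

def aiu_to_cpu_weights(name, aiu_state_dict):
    # apply the registered adaptation while converting weights
    return WEIGHT_ADAPTERS[name](aiu_state_dict)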