Merge branch 'main' into generate_metrics_layers

flaviabeo · flaviabeo · commit c430ecdd7b38 · 2025-06-19T19:21:49.000-03:00
diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py
@@ -34,33 +34,53 @@
 except ImportError:
     GPTQ_ENABLED = False
 
-ORIGINAL_HF_HOME = os.environ.get("HF_HOME", None)
+MICRO_MODELS_HOME = os.environ.get("FMS_TEST_SHAPES_MICRO_MODELS_HOME", "/mnt/home/models/tiny-models")
 
 # Add models to test here
 LLAMA_3p1_8B_INSTRUCT = "meta-llama/Llama-3.1-8B-Instruct"
 GRANITE_3p2_8B_INSTRUCT = "ibm-granite/granite-3.2-8b-instruct"
+GRANITE_3p3_8B_INSTRUCT = "ibm-granite/granite-3.3-8b-instruct"
 GRANITE_20B_CODE_INSTRUCT_8K = "ibm-granite/granite-20b-code-instruct-8k"
 LLAMA_3p1_70B_INSTRUCT = "meta-llama/Llama-3.1-70B-Instruct"
+MISTRAL_0p3_7B_INSTRUCT = "mistralai/Mistral-7B-Instruct-v0.3"
+
+micro_model_mapping = {
+    LLAMA_3p1_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "llama-3.1-8b-layers-3-step-24000"),
+    GRANITE_3p2_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"),
+    # FIXME: granite 3.3 uses the same config as 3.2, so the 3.2 micro model is re-used here; this should be updated to a dedicated 3.3 micro model
+    GRANITE_3p3_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"),
+    LLAMA_3p1_70B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "llama-3.1-70b-layers-3-step-24000")
+}
 
 SHARE_GPT_DATASET_PATH = os.environ.get(
     "SHARE_GPT_DATASET_PATH", os.path.expanduser("~/share_gpt.json")
 )
 USE_MICRO_MODELS = os.environ.get("FMS_TEST_SHAPES_USE_MICRO_MODELS", "1") == "1"
 USE_DISTRIBUTED = os.environ.get("FMS_TEST_SHAPES_DISTRIBUTED", "0") == "1"
-FORCE_VALIDATION_LEVEL_1 = os.environ.get("FMS_TEST_SHAPES_FORCE_VALIDATION_LEVEL_1", "0") == "1"
+
+FORCE_VALIDATION_LEVEL_1 = (
+    os.environ.get("FMS_TEST_SHAPES_FORCE_VALIDATION_LEVEL_1", "0") == "1"
+)
 skip_assertions = os.environ.get("FMS_TEST_SHAPES_SKIP_ASSERTIONS", {})
 validation_info_dir = os.environ.get(
     "FMS_TEST_SHAPES_VALIDATION_INFO_DIR", "/tmp/models/validation_info"
 )
 common_model_paths = os.environ.get(
     "FMS_TEST_SHAPES_COMMON_MODEL_PATHS",
-    [LLAMA_3p1_8B_INSTRUCT, GRANITE_3p2_8B_INSTRUCT, GRANITE_20B_CODE_INSTRUCT_8K, LLAMA_3p1_70B_INSTRUCT],
+    [
+        LLAMA_3p1_8B_INSTRUCT,
+        GRANITE_3p2_8B_INSTRUCT,
+        GRANITE_3p3_8B_INSTRUCT,
+        GRANITE_20B_CODE_INSTRUCT_8K,
+        LLAMA_3p1_70B_INSTRUCT,
+        MISTRAL_0p3_7B_INSTRUCT
+    ],
 )
 # for validation level 1, the default is a failure rate of 1%
 # set this environment variable if you would like to relax that threshold
 failure_rate_threshold = os.environ.get("FMS_TEST_SHAPES_FAILURE_THRESHOLD", 0.01)
 default_metrics_threshold = os.environ.get(
-    "FMS_TEST_SHAPES_METRICS_THRESHOLD", (3.0, .001)
+    "FMS_TEST_SHAPES_METRICS_THRESHOLD", (3.0, 0.001)
 )
 save_validation_info_outputs = (
     os.environ.get("FMS_TEST_SHAPES_SAVE_VALIDATION_INFO_OUTPUTS", "0") == "1"
@@ -86,7 +106,9 @@
 
 # pass custom default metrics threshold as a comma separated str of floats <cross-entropy threshold>,<mean diff threshold>
 if isinstance(default_metrics_threshold, str):
-    default_metrics_threshold = tuple([float(m) for m in default_metrics_threshold.split(",")])
+    default_metrics_threshold = tuple(
+        [float(m) for m in default_metrics_threshold.split(",")]
+    )
 
 # pass custom common batch sizes as a comma separated str of ints
 if isinstance(common_batch_sizes, str):
@@ -124,38 +146,35 @@
 # if a models failure thresholds do not exist in this dict, default to the default_metrics_threshold defined above
 # threshold key is (model_id, is_tiny_model)
 fail_thresholds = {
-    (LLAMA_3p1_8B_INSTRUCT, True): (
-        3.7392955756187423,
-        .001, # FIXME: compute
-    ),
-    (GRANITE_3p2_8B_INSTRUCT, True): (
-        2.996668996810913,
-        .001, # FIXME: compute
-    ),
-    (GRANITE_20B_CODE_INSTRUCT_8K, True): (
-        3.7392955756187423, # FIXME: compute -- setting to micro llama 3.1 8b instruct
-        .001, # FIXME: compute
-    ),
-    (LLAMA_3p1_70B_INSTRUCT, True): (
-        3.8235735702514626,
-        .001, # FIXME: compute
-    ),
     (LLAMA_3p1_8B_INSTRUCT, False): (
-        2.6994638133048965,
-        0.00047589250549208347,
+        2.7080255031585696,
+        0.0004068055667448795,
     ),
     (GRANITE_3p2_8B_INSTRUCT, False): (
         2.3919514417648315,
         0.0005767398688476533,
     ),
+    (GRANITE_3p2_8B_INSTRUCT, True): (
+        2.7449850964546205,
+        0.00018840670207282534,
+    ),
+    (GRANITE_3p3_8B_INSTRUCT, False): (
+        2.4444521379470827,
+        0.0004970188625156878,
+    ),
     (GRANITE_20B_CODE_INSTRUCT_8K, False): (
-        2.640706129074097,
-        0.00034344267623964697,
+        2.646075320243838,
+        0.0003458251833217223,
     ),
+    # TODO: run llama 70B with 1,2,4,8 batches
     (LLAMA_3p1_70B_INSTRUCT, False): (
         2.841279556751251,
         0.0044301633024588115,
     ),
+    (MISTRAL_0p3_7B_INSTRUCT, False): (
+        2.846206340789795,
+        0.0008768103783950205,
+    ),
 }
 # custom weight adaptation to be used in future. For instance if we would like to add some other adaptation, we can register it with this custom adapter
 # and provide it when converting from an aiu fms model's weights to a cpu fms model's weights. Currently this is only done for gptq, but may be done for other
@@ -170,10 +189,6 @@ def reset_compiler():
     torch.compiler.reset()
     torch._dynamo.reset()
     os.environ.pop("COMPILATION_MODE", None)
-    if ORIGINAL_HF_HOME is None:
-        os.environ.pop("HF_HOME", None)
-    else:
-        os.environ["HF_HOME"] = ORIGINAL_HF_HOME
 
 
 # TODO: Currently, gptq does not have the same level of support as non-gptq models for get_model. This method provides the extra requirements for gptq for get_model,
@@ -315,9 +330,6 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens):
     torch.manual_seed(42)
     os.environ["COMPILATION_MODE"] = "offline_decoder"
 
-    if "HF_HOME" not in os.environ:
-        os.environ["HF_HOME"] = "/tmp/models/hf_cache"
-
     dprint(
         f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}"
     )
@@ -326,13 +338,18 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens):
     gptq_kwargs_aiu, gptq_kwargs_cpu = __maybe_get_gptq_kwargs(model_path)
     is_gptq = len(gptq_kwargs_aiu) != 0
 
-    if USE_MICRO_MODELS:
+    micro_model_path = micro_model_mapping.get(model_path, None)
+    if USE_MICRO_MODELS and micro_model_path is None:
+        dprint("using randomly initialized model")
         micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3}
     else:
+        dprint("using trained model")
         micro_model_kwargs = {"architecture": "hf_pretrained"}
 
     if not USE_MICRO_MODELS and os.path.exists(model_path):
         model_path_kwargs = {"model_path": model_path}
+    elif USE_MICRO_MODELS and micro_model_path is not None:
+        model_path_kwargs = {"model_path": micro_model_path}
     else:
         model_path_kwargs = {"variant": model_path}
 
@@ -439,10 +456,12 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor):
             cross_entropy = torch.nn.CrossEntropyLoss()(
                 r, t.softmax(dim=1).to(dtype=torch.float32)
             )
-            diff = torch.mean(torch.abs(
-                r.softmax(dim=1).to(dtype=torch.float32)
-                - t.softmax(dim=1).to(dtype=torch.float32)
-            ))
+            diff = torch.mean(
+                torch.abs(
+                    r.softmax(dim=1).to(dtype=torch.float32)
+                    - t.softmax(dim=1).to(dtype=torch.float32)
+                )
+            )
             return (cross_entropy, diff)
 
         iters = 1024 // max_new_tokens
@@ -510,9 +529,20 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor):
             # only consider those metrics captured prior to the eos
             level_1_metrics = __filter_before_eos(level_1_metrics, eos_indexes)
 
-            ce_threshold, diff_threshold = fail_thresholds.get(
-                (model_path, USE_MICRO_MODELS), default_metrics_threshold
-            )
+            # if we do not have real model weights (micro model with no entry in micro_model_mapping), use the default_metrics_threshold
+            if USE_MICRO_MODELS and micro_model_path is None:
+                ce_threshold, diff_threshold = default_metrics_threshold
+            # if we have real weights, try to get the proper validation metrics threshold
+            else:
+                # if we have a micro model with real weights but no micro-model thresholds, fall back to the full model thresholds, then to default_metrics_threshold
+                if USE_MICRO_MODELS:
+                    ce_threshold, diff_threshold = fail_thresholds.get(
+                        (model_path, True), fail_thresholds.get((model_path, False), default_metrics_threshold)
+                    )
+                else:
+                    ce_threshold, diff_threshold = fail_thresholds.get(
+                        (model_path, False), default_metrics_threshold
+                    )
 
             # get all failed responses for each metric
             ce_fail_responses = filter_failed_level_1_cases(
diff --git a/tests/models/test_encoders.py b/tests/models/test_encoders.py
@@ -10,8 +10,6 @@
 import os
 import numpy as np
 
-ORIGINAL_HF_HOME = os.environ.get("HF_HOME", None)
-
 # Add models to test here
 ROBERTA_SQUAD_V2 = "deepset/roberta-base-squad2"
 
@@ -81,17 +79,10 @@ def reset_compiler():
     torch.compiler.reset()
     torch._dynamo.reset()
     os.environ.pop('COMPILATION_MODE', None)
-    if ORIGINAL_HF_HOME is None:
-        os.environ.pop('HF_HOME', None)
-    else:
-        os.environ['HF_HOME'] = ORIGINAL_HF_HOME
 
 @pytest.mark.parametrize("model_path,batch_size,seq_length", common_shapes)
 def test_common_shapes(model_path, batch_size, seq_length):
     os.environ["COMPILATION_MODE"] = "offline"
-
-    if "HF_HOME" not in os.environ:
-        os.environ["HF_HOME"] = "/tmp/models/hf_cache"
     
     dprint(f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}")
 
diff --git a/tests/models/test_model_expectations.py b/tests/models/test_model_expectations.py
@@ -13,9 +13,6 @@
 
 os.environ["COMPILATION_MODE"] = "offline"
 
-if "HF_HOME" not in os.environ:
-    os.environ["HF_HOME"] = "/tmp/models/hf_cache"
-
 model_dir = os.environ.get("FMS_TESTING_MODEL_DIR", "/tmp/models")
 LLAMA_3p1_8B_INSTRUCT = "meta-llama/Llama-3.1-8B-Instruct"
 GRANITE_3p2_8B_INSTRUCT = "ibm-granite/granite-3.2-8b-instruct"