|
34 | 34 | except ImportError: |
35 | 35 | GPTQ_ENABLED = False |
36 | 36 |
|
| 37 | +MICRO_MODELS_HOME = os.environ.get("FMS_TEST_SHAPES_MICRO_MODELS_HOME", "/mnt/home/models/tiny-models") |
| 38 | + |
37 | 39 | # Add models to test here |
38 | 40 | LLAMA_3p1_8B_INSTRUCT = "meta-llama/Llama-3.1-8B-Instruct" |
39 | 41 | GRANITE_3p2_8B_INSTRUCT = "ibm-granite/granite-3.2-8b-instruct" |
40 | 42 | GRANITE_3p3_8B_INSTRUCT = "ibm-granite/granite-3.3-8b-instruct" |
41 | 43 | GRANITE_20B_CODE_INSTRUCT_8K = "ibm-granite/granite-20b-code-instruct-8k" |
42 | 44 | LLAMA_3p1_70B_INSTRUCT = "meta-llama/Llama-3.1-70B-Instruct" |
43 | 45 |
|
| 46 | +micro_model_mapping = { |
| 47 | + LLAMA_3p1_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "llama-3.1-8b-layers-3-step-24000"), |
| 48 | + GRANITE_3p2_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"), |
| 49 | +    # FIXME: 3.3 shares its config with 3.2, so the 3.2 micro model is reused here; replace once a dedicated 3.3 checkpoint exists
| 50 | + GRANITE_3p3_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"), |
| 51 | + LLAMA_3p1_70B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "llama-3.1-70b-layers-3-step-24000") |
| 52 | +} |
| 53 | + |
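
For reference, a minimal sketch of how this mapping is consulted; the home directory below is an illustrative override, not the default:

    # FMS_TEST_SHAPES_MICRO_MODELS_HOME=/data/tiny-models  (illustrative)
    micro_path = micro_model_mapping.get(LLAMA_3p1_8B_INSTRUCT)
    # -> "/data/tiny-models/llama-3.1-8b-layers-3-step-24000"
    missing = micro_model_mapping.get(GRANITE_20B_CODE_INSTRUCT_8K)
    # -> None: models without an entry fall back to a randomly initialized micro model
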
44 | 54 | SHARE_GPT_DATASET_PATH = os.environ.get( |
45 | 55 | "SHARE_GPT_DATASET_PATH", os.path.expanduser("~/share_gpt.json") |
46 | 56 | ) |
47 | 57 | USE_MICRO_MODELS = os.environ.get("FMS_TEST_SHAPES_USE_MICRO_MODELS", "1") == "1" |
48 | 58 | USE_DISTRIBUTED = os.environ.get("FMS_TEST_SHAPES_DISTRIBUTED", "0") == "1" |
49 | | -FORCE_VALIDATION_LEVEL_1 = os.environ.get("FMS_TEST_SHAPES_FORCE_VALIDATION_LEVEL_1", "0") == "1" |
| 59 | + |
| 60 | +FORCE_VALIDATION_LEVEL_1 = ( |
| 61 | + os.environ.get("FMS_TEST_SHAPES_FORCE_VALIDATION_LEVEL_1", "0") == "1" |
| 62 | +) |
50 | 63 | skip_assertions = os.environ.get("FMS_TEST_SHAPES_SKIP_ASSERTIONS", {}) |
51 | 64 | validation_info_dir = os.environ.get( |
52 | 65 | "FMS_TEST_SHAPES_VALIDATION_INFO_DIR", "/tmp/models/validation_info" |
53 | 66 | ) |
54 | 67 | common_model_paths = os.environ.get( |
55 | 68 | "FMS_TEST_SHAPES_COMMON_MODEL_PATHS", |
56 | | - [LLAMA_3p1_8B_INSTRUCT, GRANITE_3p2_8B_INSTRUCT, GRANITE_3p3_8B_INSTRUCT, GRANITE_20B_CODE_INSTRUCT_8K, LLAMA_3p1_70B_INSTRUCT], |
| 69 | + [ |
| 70 | + LLAMA_3p1_8B_INSTRUCT, |
| 71 | + GRANITE_3p2_8B_INSTRUCT, |
| 72 | + GRANITE_3p3_8B_INSTRUCT, |
| 73 | + GRANITE_20B_CODE_INSTRUCT_8K, |
| 74 | + LLAMA_3p1_70B_INSTRUCT, |
| 75 | + ], |
57 | 76 | ) |
58 | 77 | # for validation level 1, the default is a failure rate of 1% |
59 | 78 | # set this environment variable if you would like to relax that threshold |
60 | 79 | failure_rate_threshold = os.environ.get("FMS_TEST_SHAPES_FAILURE_THRESHOLD", 0.01) |
61 | 80 | default_metrics_threshold = os.environ.get( |
62 | | - "FMS_TEST_SHAPES_METRICS_THRESHOLD", (3.0, .001) |
| 81 | + "FMS_TEST_SHAPES_METRICS_THRESHOLD", (3.0, 0.001) |
63 | 82 | ) |
64 | 83 | save_validation_info_outputs = ( |
65 | 84 | os.environ.get("FMS_TEST_SHAPES_SAVE_VALIDATION_INFO_OUTPUTS", "0") == "1" |
|
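
As a usage note, all of the knobs above are environment-driven; an illustrative way to override a few of them before the test module is imported (values are examples, not defaults):

    import os

    os.environ["FMS_TEST_SHAPES_USE_MICRO_MODELS"] = "0"      # exercise full-size checkpoints
    os.environ["FMS_TEST_SHAPES_FAILURE_THRESHOLD"] = "0.05"  # relax the level-1 failure rate to 5%
    os.environ["FMS_TEST_SHAPES_METRICS_THRESHOLD"] = "3.5,0.002"  # <cross-entropy>,<mean diff>
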
85 | 104 |
|
86 | 105 | # pass a custom default metrics threshold as a comma-separated str of floats: <cross-entropy threshold>,<mean diff threshold>
87 | 106 | if isinstance(default_metrics_threshold, str): |
88 | | - default_metrics_threshold = tuple([float(m) for m in default_metrics_threshold.split(",")]) |
| 107 | + default_metrics_threshold = tuple( |
| 108 | + [float(m) for m in default_metrics_threshold.split(",")] |
| 109 | + ) |
89 | 110 |
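
A quick worked example of the parsing above:

    # FMS_TEST_SHAPES_METRICS_THRESHOLD="3.5,0.002" becomes the tuple (3.5, 0.002):
    assert tuple(float(m) for m in "3.5,0.002".split(",")) == (3.5, 0.002)
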
|
90 | 111 | # pass custom common batch sizes as a comma-separated str of ints
91 | 112 | if isinstance(common_batch_sizes, str): |
|
123 | 144 | # if a model's failure thresholds do not exist in this dict, fall back to the default_metrics_threshold defined above
124 | 145 | # threshold key is (model_id, is_tiny_model) |
125 | 146 | fail_thresholds = { |
126 | | - (LLAMA_3p1_8B_INSTRUCT, True): ( |
127 | | - 3.7392955756187423, |
128 | | - .001, # FIXME: compute |
129 | | - ), |
130 | | - (GRANITE_3p2_8B_INSTRUCT, True): ( |
131 | | - 2.996668996810913, |
132 | | - .001, # FIXME: compute |
133 | | - ), |
134 | | - (GRANITE_20B_CODE_INSTRUCT_8K, True): ( |
135 | | - 3.7392955756187423, # FIXME: compute -- setting to micro llama 3.1 8b instruct |
136 | | - .001, # FIXME: compute |
137 | | - ), |
138 | | - (LLAMA_3p1_70B_INSTRUCT, True): ( |
139 | | - 3.8235735702514626, |
140 | | - .001, # FIXME: compute |
141 | | - ), |
142 | 147 | (LLAMA_3p1_8B_INSTRUCT, False): ( |
143 | 148 | 2.6994638133048965, |
144 | 149 | 0.00047589250549208347, |
@@ -322,13 +327,18 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens): |
322 | 327 | gptq_kwargs_aiu, gptq_kwargs_cpu = __maybe_get_gptq_kwargs(model_path) |
323 | 328 | is_gptq = len(gptq_kwargs_aiu) != 0 |
324 | 329 |
|
325 | | - if USE_MICRO_MODELS: |
| 330 | + micro_model_path = micro_model_mapping.get(model_path, None) |
| 331 | + if USE_MICRO_MODELS and micro_model_path is None: |
| 332 | + dprint("using randomly initialized model") |
326 | 333 | micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3} |
327 | 334 | else: |
| 335 | + dprint("using trained model") |
328 | 336 | micro_model_kwargs = {"architecture": "hf_pretrained"} |
329 | 337 |
|
330 | 338 | if not USE_MICRO_MODELS and os.path.exists(model_path): |
331 | 339 | model_path_kwargs = {"model_path": model_path} |
| 340 | + elif USE_MICRO_MODELS and micro_model_path is not None: |
| 341 | + model_path_kwargs = {"model_path": micro_model_path} |
332 | 342 | else: |
333 | 343 | model_path_kwargs = {"variant": model_path} |
334 | 344 |
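
Condensed, the two branches above resolve to one of four model sources. A hypothetical helper restating that logic (not part of the diff):

    def _resolve_model_source(model_path, use_micro, micro_path):
        if use_micro and micro_path is not None:
            # trained micro checkpoint available in the mapping
            return {"architecture": "hf_pretrained", "model_path": micro_path}
        if use_micro:
            # no trained micro weights: randomly initialized 3-layer stand-in
            return {"architecture": "hf_configured", "nlayers": 3, "variant": model_path}
        if os.path.exists(model_path):
            # local full-size checkpoint
            return {"architecture": "hf_pretrained", "model_path": model_path}
        # resolve by HF variant name
        return {"architecture": "hf_pretrained", "variant": model_path}
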
|
@@ -435,10 +445,12 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): |
435 | 445 | cross_entropy = torch.nn.CrossEntropyLoss()( |
436 | 446 | r, t.softmax(dim=1).to(dtype=torch.float32) |
437 | 447 | ) |
438 | | - diff = torch.mean(torch.abs( |
439 | | - r.softmax(dim=1).to(dtype=torch.float32) |
440 | | - - t.softmax(dim=1).to(dtype=torch.float32) |
441 | | - )) |
| 448 | + diff = torch.mean( |
| 449 | + torch.abs( |
| 450 | + r.softmax(dim=1).to(dtype=torch.float32) |
| 451 | + - t.softmax(dim=1).to(dtype=torch.float32) |
| 452 | + ) |
| 453 | + ) |
442 | 454 | return (cross_entropy, diff) |
443 | 455 |
|
444 | 456 | iters = 1024 // max_new_tokens |
@@ -506,9 +518,20 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): |
506 | 518 | # only consider those metrics captured prior to the eos |
507 | 519 | level_1_metrics = __filter_before_eos(level_1_metrics, eos_indexes) |
508 | 520 |
|
509 | | - ce_threshold, diff_threshold = fail_thresholds.get( |
510 | | - (model_path, USE_MICRO_MODELS), default_metrics_threshold |
511 | | - ) |
| 521 | +    # if we do not have real model weights, use the default_metrics_threshold
| 522 | + if USE_MICRO_MODELS and micro_model_path is None: |
| 523 | + ce_threshold, diff_threshold = default_metrics_threshold |
| 524 | +    # if we have real weights, try to get the proper validation metrics threshold
| 525 | + else: |
| 526 | +        # if we have a micro model with real weights but no micro-specific thresholds, fall back to the full-model thresholds
| 527 | + if USE_MICRO_MODELS: |
| 528 | + ce_threshold, diff_threshold = fail_thresholds.get( |
| 529 | + (model_path, True), fail_thresholds.get((model_path, False), default_metrics_threshold) |
| 530 | + ) |
| 531 | + else: |
| 532 | + ce_threshold, diff_threshold = fail_thresholds.get( |
| 533 | + (model_path, False), default_metrics_threshold |
| 534 | + ) |
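
Equivalently, the fallback chain above restated as a hypothetical helper:

    def _resolve_thresholds(model_path, use_micro, micro_path):
        if use_micro and micro_path is None:
            # randomly initialized weights: only the generic default applies
            return default_metrics_threshold
        full = fail_thresholds.get((model_path, False), default_metrics_threshold)
        if use_micro:
            # prefer micro-model thresholds, then full-model ones, then the default
            return fail_thresholds.get((model_path, True), full)
        return full
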
512 | 535 |
|
513 | 536 | # get all failed responses for each metric |
514 | 537 | ce_fail_responses = filter_failed_level_1_cases( |
|