From 4e611a9c02e6e23a1180467a229df843d3a600f5 Mon Sep 17 00:00:00 2001 From: "Rashed Z. Bhatti, PhD" Date: Mon, 6 Oct 2025 19:55:34 +0000 Subject: [PATCH 01/22] head_size = getattr(model.config, "head_dim", model.config.emb_dim // model.config.nheads ) Signed-off-by: Rashed Z. Bhatti, PhD --- aiu_fms_testing_utils/utils/paged.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aiu_fms_testing_utils/utils/paged.py b/aiu_fms_testing_utils/utils/paged.py index edf0c548..1d6bcbc7 100644 --- a/aiu_fms_testing_utils/utils/paged.py +++ b/aiu_fms_testing_utils/utils/paged.py @@ -153,7 +153,9 @@ def generate( raise ValueError("model must have a distributed_strategy") kvheads = kvheads // tensor_parallel_size if kvheads > 1 else kvheads - head_size = model.config.emb_dim // nheads + head_size = getattr( + model.config, "head_dim", model.config.emb_dim // model.config.nheads + ) if "fp8" in kwargs["attn_name"]: from fms_mo.aiu_addons.fp8.fp8_utils import ScaledTensor From 043926cbb769799874e45123f10a4ca19232502b Mon Sep 17 00:00:00 2001 From: kcirred <16872435+kcirred@users.noreply.github.com> Date: Fri, 26 Sep 2025 22:00:28 +0000 Subject: [PATCH 02/22] [dpp] store enforce_sizes in log name and added generic kwargs to get_default_validation_prefix, enable sample_key Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com> --- aiu_fms_testing_utils/testing/utils.py | 19 +++++ aiu_fms_testing_utils/testing/validation.py | 17 ++++- aiu_fms_testing_utils/utils/__init__.py | 43 ++++++++++- scripts/drive_paged_programs.py | 9 ++- tests/testing/test_validation.py | 85 +++++++++++++++++++++ 5 files changed, 166 insertions(+), 7 deletions(-) create mode 100644 aiu_fms_testing_utils/testing/utils.py diff --git a/aiu_fms_testing_utils/testing/utils.py b/aiu_fms_testing_utils/testing/utils.py new file mode 100644 index 00000000..79ae564f --- /dev/null +++ b/aiu_fms_testing_utils/testing/utils.py @@ -0,0 +1,19 @@ +from collections.abc import Iterable + + +def format_kwargs_to_string(**kwargs): + formatted_pairs = [] + for key, value in sorted(kwargs.items()): + formatted_value = None + if isinstance(value, str): + formatted_value = value + elif isinstance(value, Iterable): + formatted_value = ",".join(map(str, value)) + elif value: + formatted_value = str(value) + # only append if formatted_value exists + if formatted_value: + # Keep previous convention of variable names with `-` instead of `_` + formatted_pairs.append(f"{key.replace('_', '-')}-{formatted_value}") + + return "_".join(formatted_pairs) diff --git a/aiu_fms_testing_utils/testing/validation.py b/aiu_fms_testing_utils/testing/validation.py index 0c655ff5..5749cb89 100644 --- a/aiu_fms_testing_utils/testing/validation.py +++ b/aiu_fms_testing_utils/testing/validation.py @@ -5,6 +5,7 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint from aiu_fms_testing_utils._version import version_tuple import os +from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string class LogitsExtractorHook( @@ -132,6 +133,7 @@ def get_default_validation_prefix( dtype: str, attn_type: str, aftu_version: str, + **kwargs, ): """ Args: @@ -146,7 +148,12 @@ def get_default_validation_prefix( Returns: str: A prefix that will be prepended to the file name """ - return f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}.{aftu_version}" + kwargs_str = format_kwargs_to_string(**kwargs) + + if kwargs_str == "": + return 
f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}.{aftu_version}" + else: + return f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}_{kwargs_str}.{aftu_version}" def load_validation_information( @@ -416,11 +423,14 @@ def get_validation_info_path( aftu_version: Optional[Tuple[int, int, int]] = None, device_type: str = "cpu", dtype: str = "fp16", + **kwargs, ): if aftu_version is None: aftu_version = version_tuple - validation_file_name = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]))}.{device_type}_validation_info.{seed}.out" + sample_key = kwargs.get("sample_key", None) + + validation_file_name = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]), sample_key=sample_key)}.{device_type}_validation_info.{seed}.out" full_path = os.path.join(validation_info_dir, validation_file_name) return full_path @@ -452,10 +462,12 @@ def find_validation_info_path( version_allow_decrement: bool = False, device_type: str = "cpu", dtype: str = "fp16", + **kwargs, ): """ Find the validation info path if it exists, otherwise return None """ + enforce_sizes = kwargs.get("enforce_sizes", None) if aftu_version is None: loc_version_tuple = version_tuple[:3] @@ -476,6 +488,7 @@ def find_validation_info_path( loc_version_tuple, device_type, dtype, + enforce_sizes=enforce_sizes, ) # if the path is found, we are done searching and can return if os.path.exists(full_path): diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 65a0f9ab..796567ee 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -11,6 +11,7 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, world_size from transformers.tokenization_utils_base import PreTrainedTokenizerBase +from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string from fms.utils.generation import pad_input_ids import torch @@ -482,6 +483,7 @@ def sample_rag_factoid_requests( enforce_sizes: List[int] = [], truncation: bool = False, pad_multiple: int = 64, + return_key: bool = False, ) -> List[Tuple[str, int]]: if not os.path.exists(dataset_path): print("error dataset does not exist") @@ -492,7 +494,7 @@ def sample_rag_factoid_requests( for line in f: dataset.append(line) - return __sample_requests( + sample_request = __sample_requests( dataset, num_requests, tokenizer, @@ -506,6 +508,24 @@ def sample_rag_factoid_requests( _cached_dataset_key=dataset_path, ) + sample_key: str = format_kwargs_to_string( + dataset="rag_factoid", + num_requests=num_requests, + tokenizer=tokenizer.name_or_path.replace("/", "--"), + prompt_length_min=prompt_length_min, + prompt_length_max=prompt_length_max, + seed=seed, + enforce_heterogeneous=enforce_heterogeneous, + enforce_sizes=enforce_sizes, + truncate=truncation, + pad_multiple=pad_multiple, + ) + + if return_key: + return sample_request, sample_key + else: + return sample_request + def sample_sharegpt_requests( dataset_path: str, @@ -518,6 +538,7 @@ def sample_sharegpt_requests( enforce_sizes: List[int] | None = None, truncation: bool = False, pad_multiple: int = 64, + return_key: bool = False, ) -> List[Tuple[str, int]]: if not 
os.path.exists(dataset_path): print("downloading share-gpt dataset as it does not exist") @@ -543,7 +564,7 @@ def sample_sharegpt_requests( dataset = [data for data in dataset if len(data["conversations"]) >= 2] dataset: List[str] = [data["conversations"][0]["value"] for data in dataset] - return __sample_requests( + sample_request = __sample_requests( dataset, num_requests, tokenizer, @@ -557,6 +578,24 @@ def sample_sharegpt_requests( _cached_dataset_key=dataset_path, ) + sample_key: str = format_kwargs_to_string( + dataset="sharegpt", + num_requests=num_requests, + tokenizer=tokenizer.name_or_path.replace("/", "--"), + prompt_length_min=prompt_length_min, + prompt_length_max=prompt_length_max, + seed=seed, + enforce_heterogeneous=enforce_heterogeneous, + enforce_sizes=enforce_sizes, + truncate=truncation, + pad_multiple=pad_multiple, + ) + + if return_key: + return sample_request, sample_key + else: + return sample_request + def sample_squad_v2_qa_requests( dataset_path: str, diff --git a/scripts/drive_paged_programs.py b/scripts/drive_paged_programs.py index ea51bad8..469e3ae0 100644 --- a/scripts/drive_paged_programs.py +++ b/scripts/drive_paged_programs.py @@ -245,7 +245,7 @@ def __custom_line_sampler(*args, **kwargs): def __prepare_inputs(batch_size, seq_length, tokenizer, enforce_sizes=[], seed=0): start = time.time() - prompts_and_sizes = sampler( + prompts_and_sizes, sample_key = sampler( DATASET_PATH, batch_size, tokenizer, @@ -254,6 +254,7 @@ def __prepare_inputs(batch_size, seq_length, tokenizer, enforce_sizes=[], seed=0 seed, enforce_sizes=enforce_sizes, truncation=allow_truncation, + return_key=True, ) end = time.time() if local_rank == 0: @@ -274,7 +275,7 @@ def __prepare_inputs(batch_size, seq_length, tokenizer, enforce_sizes=[], seed=0 input_ids, extra_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length) extra_kwargs["mask"] = extra_kwargs["mask"].to(torch.float16) - return input_ids, extra_kwargs + return input_ids, extra_kwargs, sample_key def __maybe_prepare_fp8_weights(model_in, is_fp8): @@ -367,13 +368,14 @@ def __load_validation_info( # warmup with any input so compiler produces criteria json # TODO: Swap this with __prepare_inputs once fix for shape_id is available -# input_ids, extra_kwargs = __prepare_inputs(2, max_tkv, tokenizer) +# input_ids, extra_kwargs, sample_key = __prepare_inputs(2, max_tkv, tokenizer) prompt_list = [torch.arange(0, 64, dtype=torch.int64)] # matching vllm warmup to pad to 2 on fp8, and no pad for fp16 if is_fp8: prompt_list = prompt_list * 2 input_ids, extra_kwargs = pad_input_ids(prompt_list, min_pad_length=64) extra_kwargs["mask"] = extra_kwargs["mask"].to(torch.float16) + extra_kwargs["attn_name"] = ATTN_NAME if ( "granite-3.3-8b-instruct" in model_variant @@ -657,6 +659,7 @@ def __metric_calculator(r: torch.Tensor, t: torch.Tensor): 0, ATTN_NAME, dtype=CPU_DTYPE, + sample_key=sample_key, ) ) diff --git a/tests/testing/test_validation.py b/tests/testing/test_validation.py index ac3367ae..b02bd19c 100644 --- a/tests/testing/test_validation.py +++ b/tests/testing/test_validation.py @@ -8,7 +8,13 @@ get_validation_info_path, find_validation_info_path, __decrement_version, + get_default_validation_prefix, ) +import os +from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string +from aiu_fms_testing_utils.utils import sample_sharegpt_requests +from transformers import AutoTokenizer + from aiu_fms_testing_utils._version import version_tuple from fms.models import get_model from fms.utils.generation import pad_input_ids 
@@ -238,3 +244,82 @@ def test_decrement_version(max_minor, max_patch, current_version): + patch + 1 ) +def test_format_kwargs_to_string(): + kwargs = { + "enforce_sizes": [1, 32, 4, 8], + "batch_size": 1, + "model_id": "granite-3.3-8b", + "seq_len": 64, + } + kwargs_str = format_kwargs_to_string(**kwargs) + assert ( + kwargs_str + == "batch-size-1_enforce-sizes-1,32,4,8_model-id-granite-3.3-8b_seq-len-64" + ) + + +DATASET_PATH = os.getenv( + "DATASET_PATH", "/mnt/home/models/ShareGPT_V3_unfiltered_cleaned_split.json" +) +TOKENIZER = os.getenv("TOKENIZER", "ibm-granite/granite-3.3-8b-Instruct") + + +@pytest.mark.parametrize( + "model_variant,max_new_tokens,batch_size,seq_length,dtype,attn_type,device_type,seed,aftu_version", + [("granite-3.3-8b", 64, 2, 64, "fp16", "spda", "cpu", 0, (1, 2, 3))], +) +def test_get_default_validation_prefix( + model_variant, + max_new_tokens, + batch_size, + seq_length, + dtype, + attn_type, + device_type, + seed, + aftu_version, +): + tokenizer = AutoTokenizer.from_pretrained(TOKENIZER) + + sample_key = None + # get_default_validation_prefix with sample_key set to None + prefix_sample_key_none = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]), sample_key=sample_key)}.{device_type}_validation_info.{seed}.out" + + assert ( + prefix_sample_key_none + == f"{model_variant}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}.1.2.3.cpu_validation_info.0.out" + ) + + # get_default_validation_prefix with no kwargs using legacy case + legacy_prefix = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]))}.{device_type}_validation_info.{seed}.out" + assert prefix_sample_key_none == legacy_prefix + + # retrieve a sample_key with return_key is True + dataset_1, sample_key = sample_sharegpt_requests( + DATASET_PATH, + batch_size, + tokenizer, + 32, + seq_length * 2, + seed=seed, + enforce_sizes=[], + return_key=True, + ) + prefix_with_sample_key = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]), sample_key=sample_key)}.{device_type}_validation_info.{seed}.out" + + # Check sample key sorted by parameter name + assert sample_key.split("_") == sorted(sample_key.split("_")) + # Check sample key included in name as expected + assert "sample-key-" + sample_key in prefix_with_sample_key + + dataset_2 = sample_sharegpt_requests( + DATASET_PATH, + batch_size, + tokenizer, + 32, + seq_length * 2, + seed=seed, + enforce_sizes=[], + ) + + assert dataset_1 == dataset_2 From 6d6541f0b6d2c1eff9450210b0b042614a1a2a0b Mon Sep 17 00:00:00 2001 From: kcirred <16872435+kcirred@users.noreply.github.com> Date: Mon, 29 Sep 2025 15:23:44 +0000 Subject: [PATCH 03/22] [utils] added doc string, refactor sample_key, added return_key to squad_v2 sampler Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com> --- aiu_fms_testing_utils/testing/utils.py | 3 ++ aiu_fms_testing_utils/utils/__init__.py | 71 +++++++++++++++---------- 2 files changed, 47 insertions(+), 27 deletions(-) diff --git a/aiu_fms_testing_utils/testing/utils.py b/aiu_fms_testing_utils/testing/utils.py index 79ae564f..72fd30b2 100644 --- a/aiu_fms_testing_utils/testing/utils.py +++ b/aiu_fms_testing_utils/testing/utils.py @@ -2,6 +2,9 @@ def 
format_kwargs_to_string(**kwargs): + """ + Turns kwargs into a str with variable names using `-`, variables separated by `_` and iterable separated by `,` + """ formatted_pairs = [] for key, value in sorted(kwargs.items()): formatted_value = None diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 796567ee..6615c5c9 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -508,20 +508,20 @@ def sample_rag_factoid_requests( _cached_dataset_key=dataset_path, ) - sample_key: str = format_kwargs_to_string( - dataset="rag_factoid", - num_requests=num_requests, - tokenizer=tokenizer.name_or_path.replace("/", "--"), - prompt_length_min=prompt_length_min, - prompt_length_max=prompt_length_max, - seed=seed, - enforce_heterogeneous=enforce_heterogeneous, - enforce_sizes=enforce_sizes, - truncate=truncation, - pad_multiple=pad_multiple, - ) - if return_key: + sample_key: str = format_kwargs_to_string( + dataset="rag_factoid", + num_requests=num_requests, + tokenizer=tokenizer.name_or_path.replace("/", "--"), + prompt_length_min=prompt_length_min, + prompt_length_max=prompt_length_max, + seed=seed, + enforce_heterogeneous=enforce_heterogeneous, + enforce_sizes=enforce_sizes, + truncate=truncation, + pad_multiple=pad_multiple, + ) + return sample_request, sample_key else: return sample_request @@ -578,20 +578,19 @@ def sample_sharegpt_requests( _cached_dataset_key=dataset_path, ) - sample_key: str = format_kwargs_to_string( - dataset="sharegpt", - num_requests=num_requests, - tokenizer=tokenizer.name_or_path.replace("/", "--"), - prompt_length_min=prompt_length_min, - prompt_length_max=prompt_length_max, - seed=seed, - enforce_heterogeneous=enforce_heterogeneous, - enforce_sizes=enforce_sizes, - truncate=truncation, - pad_multiple=pad_multiple, - ) - if return_key: + sample_key: str = format_kwargs_to_string( + dataset="sharegpt", + num_requests=num_requests, + tokenizer=tokenizer.name_or_path.replace("/", "--"), + prompt_length_min=prompt_length_min, + prompt_length_max=prompt_length_max, + seed=seed, + enforce_heterogeneous=enforce_heterogeneous, + enforce_sizes=enforce_sizes, + truncate=truncation, + pad_multiple=pad_multiple, + ) return sample_request, sample_key else: return sample_request @@ -608,6 +607,7 @@ def sample_squad_v2_qa_requests( enforce_sizes: List[int] | None = None, truncation: bool = False, pad_multiple: int = 64, + return_key: bool = False, ) -> List[Tuple[str, int]]: from datasets import load_dataset @@ -621,7 +621,7 @@ def sample_squad_v2_qa_requests( ds = [f"{data['context']}\n{data['question']}" for data in ds] - return __sample_requests( + sample_request = __sample_requests( ds, num_requests, tokenizer, @@ -634,6 +634,23 @@ def sample_squad_v2_qa_requests( pad_multiple, ) + if return_key: + sample_key: str = format_kwargs_to_string( + dataset="squad_v2", + num_requests=num_requests, + tokenizer=tokenizer.name_or_path.replace("/", "--"), + prompt_length_min=prompt_length_min, + prompt_length_max=prompt_length_max, + seed=seed, + enforce_heterogeneous=enforce_heterogeneous, + enforce_sizes=enforce_sizes, + truncate=truncation, + pad_multiple=pad_multiple, + ) + return sample_request, sample_key + else: + return sample_request + def prepare_inputs( batch_size, seq_length, tokenizer, ds_path, seed=0, ds_type="sharegpt" From 97aeea3a3bc66edd717080e9f21874c32c854ed0 Mon Sep 17 00:00:00 2001 From: kcirred <16872435+kcirred@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:12:19 +0000 
Subject: [PATCH 04/22] [dpp/validation] restore sample_key in logic after rebase of main Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com> --- aiu_fms_testing_utils/testing/validation.py | 2 ++ scripts/drive_paged_programs.py | 12 +++++++++--- tests/models/test_decoders.py | 8 +++++++- tests/testing/test_validation.py | 16 ++++++++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/aiu_fms_testing_utils/testing/validation.py b/aiu_fms_testing_utils/testing/validation.py index 5749cb89..f62c7def 100644 --- a/aiu_fms_testing_utils/testing/validation.py +++ b/aiu_fms_testing_utils/testing/validation.py @@ -468,6 +468,7 @@ def find_validation_info_path( Find the validation info path if it exists, otherwise return None """ enforce_sizes = kwargs.get("enforce_sizes", None) + sample_key = kwargs.get("sample_key", None) if aftu_version is None: loc_version_tuple = version_tuple[:3] @@ -489,6 +490,7 @@ def find_validation_info_path( device_type, dtype, enforce_sizes=enforce_sizes, + sample_key=sample_key, ) # if the path is found, we are done searching and can return if os.path.exists(full_path): diff --git a/scripts/drive_paged_programs.py b/scripts/drive_paged_programs.py index 469e3ae0..e3cb7ca6 100644 --- a/scripts/drive_paged_programs.py +++ b/scripts/drive_paged_programs.py @@ -297,7 +297,9 @@ def __load_validation_info( tokenizer, seed, attn_type: str, + **kwargs, ): + sample_key = kwargs.get("sample_key", None) full_path = find_validation_info_path( args.validation_info_outputs_dir, model_variant, @@ -308,6 +310,7 @@ def __load_validation_info( attn_type, version_allow_decrement=True, dtype=CPU_DTYPE, + sample_key=sample_key, ) if full_path is not None: dprint(f"cpu validation info found for seed={seed} -- loading it") @@ -496,7 +499,7 @@ def parse_program_limit(limit_str: str) -> tuple[int, str]: for valid_prompt_shape in valid_prompt_shapes: if valid_prompt_shape == custom_shape: enforce_sizes = [valid_prompt_shape[1]] - input_ids, extra_kwargs = __prepare_inputs( + input_ids, extra_kwargs, sample_key = __prepare_inputs( valid_prompt_shape[0], valid_prompt_shape[1], tokenizer, @@ -508,6 +511,7 @@ def parse_program_limit(limit_str: str) -> tuple[int, str]: custom_shape, input_ids, extra_kwargs, + sample_key, ) ] break @@ -568,7 +572,7 @@ def parse_program_limit(limit_str: str) -> tuple[int, str]: ) ) try: - input_ids, extra_kwargs = __prepare_inputs( + input_ids, extra_kwargs, sample_key = __prepare_inputs( valid_prompt_shape[0], valid_prompt_shape[1], tokenizer, @@ -580,6 +584,7 @@ def parse_program_limit(limit_str: str) -> tuple[int, str]: valid_prompt_shape, input_ids, extra_kwargs, + sample_key, ) ) used_keys.add(program_seq_key[0]) @@ -611,7 +616,7 @@ def __metric_calculator(r: torch.Tensor, t: torch.Tensor): failed_cases = [] # for each program and valid prompt (batch size, sequence length) -for program_id, valid_prompt, input_ids, extra_kwargs in valid_prompts: +for program_id, valid_prompt, input_ids, extra_kwargs, sample_key in valid_prompts: extra_kwargs["attn_name"] = ATTN_NAME if ( "granite-3.3-8b-instruct" in model_variant @@ -636,6 +641,7 @@ def __metric_calculator(r: torch.Tensor, t: torch.Tensor): tokenizer, seed=0, attn_type=ATTN_NAME, + sample_key=sample_key, ) # if the cpu validation info is not yet computed, compute it if cpu_validation_info is None: diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index 122c9664..d257c44c 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ 
-364,7 +364,13 @@ def __filter_before_eos(metrics, filter_indexes): def __load_validation_info( - model_path, batch_size, seq_length, max_new_tokens, tokenizer, seed, attn_type: str + model_path, + batch_size, + seq_length, + max_new_tokens, + tokenizer, + seed, + attn_type: str, ): # if path doesn't exist and paged isn't in the attention name, remove `attn_type` and recheck again, warn that we will no longer in the future have paths without 'attn_type' full_path = find_validation_info_path( diff --git a/tests/testing/test_validation.py b/tests/testing/test_validation.py index b02bd19c..19c11b07 100644 --- a/tests/testing/test_validation.py +++ b/tests/testing/test_validation.py @@ -99,6 +99,20 @@ def test_get_validation_info_path(tmp_path): == f"{tmp_path}/ibm-granite--granite-3.3-8b-instruct_max-new-tokens-128_batch-size-4_seq-length-64_dtype-fp16_attn-type-sdpa.1.2.3.cpu_validation_info.0.out" ) + # Check that it is accepting kwargs and handling sample_key + dummy_sample_key = "dataset-sharegpt_num-requests-4_pad-multiple-64_prompt-length-max-128_prompt-length-min-32_tokenizer-ibm-granite--granite-3.3-8b-Instruct" + assert "sample_key" and "dataset" in get_validation_info_path( + tmp_path, + "ibm-granite/granite-3.3-8b-instruct", + 4, + 64, + 128, + 0, + "sdpa", + aftu_version=(1, 2, 3), + sample_key=dummy_sample_key, + ) + @pytest.mark.parametrize( "current_version,save_version,expected_version,version_allow_decrement", @@ -244,6 +258,8 @@ def test_decrement_version(max_minor, max_patch, current_version): + patch + 1 ) + + def test_format_kwargs_to_string(): kwargs = { "enforce_sizes": [1, 32, 4, 8], From 9efb6e74d608e4bdc26a8192487829fc4640aed6 Mon Sep 17 00:00:00 2001 From: kcirred <16872435+kcirred@users.noreply.github.com> Date: Fri, 3 Oct 2025 17:56:49 +0000 Subject: [PATCH 05/22] [validation] Modified final file string to hash due to OSError name too long Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com> --- aiu_fms_testing_utils/testing/validation.py | 11 ++++-- tests/testing/test_validation.py | 38 +++++++++------------ 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/aiu_fms_testing_utils/testing/validation.py b/aiu_fms_testing_utils/testing/validation.py index f62c7def..072c9c0a 100644 --- a/aiu_fms_testing_utils/testing/validation.py +++ b/aiu_fms_testing_utils/testing/validation.py @@ -7,6 +7,8 @@ import os from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string +import hashlib + class LogitsExtractorHook( Callable[ @@ -146,14 +148,17 @@ def get_default_validation_prefix( aftu_version (str): introduced in v0.3.0 to track changed in log Returns: - str: A prefix that will be prepended to the file name + str: A hashed prefix that will be prepended to the file name """ kwargs_str = format_kwargs_to_string(**kwargs) if kwargs_str == "": - return f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}.{aftu_version}" + filename = f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}" else: - return f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}_{kwargs_str}.{aftu_version}" + filename = f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}_{kwargs_str}" + 
hash_object = hashlib.sha256(filename.encode("utf-8")) + hex_digest = hash_object.hexdigest() + return f"{hex_digest}_{aftu_version}" def load_validation_information( diff --git a/tests/testing/test_validation.py b/tests/testing/test_validation.py index 19c11b07..74f6403c 100644 --- a/tests/testing/test_validation.py +++ b/tests/testing/test_validation.py @@ -10,6 +10,7 @@ __decrement_version, get_default_validation_prefix, ) +import hashlib import os from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string from aiu_fms_testing_utils.utils import sample_sharegpt_requests @@ -79,12 +80,21 @@ def test_validation_info_round_trip(validation_type, post_iteration_hook): def test_get_validation_info_path(tmp_path): + check_pathname = "ibm-granite--granite-3.3-8b-instruct_max-new-tokens-128_batch-size-4_seq-length-64_dtype-fp16_attn-type-sdpa" + hash_object = hashlib.sha256(check_pathname.encode("utf-8")) + hex_digest = hash_object.hexdigest() + assert ( get_validation_info_path( tmp_path, "ibm-granite/granite-3.3-8b-instruct", 4, 64, 128, 0, "sdpa" ) - == f"{tmp_path}/ibm-granite--granite-3.3-8b-instruct_max-new-tokens-128_batch-size-4_seq-length-64_dtype-fp16_attn-type-sdpa.{'.'.join([str(_) for _ in version_tuple[:3]])}.cpu_validation_info.0.out" + == f"{tmp_path}/{hex_digest}_{'.'.join([str(_) for _ in version_tuple[:3]])}.cpu_validation_info.0.out" ) + + check_pathname = "ibm-granite--granite-3.3-8b-instruct_max-new-tokens-128_batch-size-4_seq-length-64_dtype-fp16_attn-type-sdpa" + hash_object = hashlib.sha256(check_pathname.encode("utf-8")) + hex_digest = hash_object.hexdigest() + assert ( get_validation_info_path( tmp_path, @@ -96,21 +106,7 @@ def test_get_validation_info_path(tmp_path): "sdpa", aftu_version=(1, 2, 3), ) - == f"{tmp_path}/ibm-granite--granite-3.3-8b-instruct_max-new-tokens-128_batch-size-4_seq-length-64_dtype-fp16_attn-type-sdpa.1.2.3.cpu_validation_info.0.out" - ) - - # Check that it is accepting kwargs and handling sample_key - dummy_sample_key = "dataset-sharegpt_num-requests-4_pad-multiple-64_prompt-length-max-128_prompt-length-min-32_tokenizer-ibm-granite--granite-3.3-8b-Instruct" - assert "sample_key" and "dataset" in get_validation_info_path( - tmp_path, - "ibm-granite/granite-3.3-8b-instruct", - 4, - 64, - 128, - 0, - "sdpa", - aftu_version=(1, 2, 3), - sample_key=dummy_sample_key, + == f"{tmp_path}/{hex_digest}_1.2.3.cpu_validation_info.0.out" ) @@ -299,12 +295,12 @@ def test_get_default_validation_prefix( sample_key = None # get_default_validation_prefix with sample_key set to None + check_prefix_sample_key_none = f"{model_variant}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}" + hash_object = hashlib.sha256(check_prefix_sample_key_none.encode("utf-8")) + hex_digest = hash_object.hexdigest() prefix_sample_key_none = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]), sample_key=sample_key)}.{device_type}_validation_info.{seed}.out" - assert ( - prefix_sample_key_none - == f"{model_variant}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}.1.2.3.cpu_validation_info.0.out" - ) + assert prefix_sample_key_none == f"{hex_digest}_1.2.3.cpu_validation_info.0.out" # get_default_validation_prefix with no kwargs using legacy case legacy_prefix = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, 
seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]))}.{device_type}_validation_info.{seed}.out" @@ -325,8 +321,6 @@ def test_get_default_validation_prefix( # Check sample key sorted by parameter name assert sample_key.split("_") == sorted(sample_key.split("_")) - # Check sample key included in name as expected - assert "sample-key-" + sample_key in prefix_with_sample_key dataset_2 = sample_sharegpt_requests( DATASET_PATH, From ea1a8aca91aa72394503efd4af82f9096185114c Mon Sep 17 00:00:00 2001 From: kcirred <16872435+kcirred@users.noreply.github.com> Date: Fri, 3 Oct 2025 18:11:01 +0000 Subject: [PATCH 06/22] [test_validation] remove unused line Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com> --- tests/testing/test_validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testing/test_validation.py b/tests/testing/test_validation.py index 74f6403c..220f89e9 100644 --- a/tests/testing/test_validation.py +++ b/tests/testing/test_validation.py @@ -317,7 +317,6 @@ def test_get_default_validation_prefix( enforce_sizes=[], return_key=True, ) - prefix_with_sample_key = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]), sample_key=sample_key)}.{device_type}_validation_info.{seed}.out" # Check sample key sorted by parameter name assert sample_key.split("_") == sorted(sample_key.split("_")) From b637ea502409145824a02aa3841aea8de7fa84a7 Mon Sep 17 00:00:00 2001 From: kcirred <16872435+kcirred@users.noreply.github.com> Date: Fri, 3 Oct 2025 19:11:04 +0000 Subject: [PATCH 07/22] [validation] removed enforce_sizes from find_validation_info_path Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com> --- aiu_fms_testing_utils/testing/validation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/aiu_fms_testing_utils/testing/validation.py b/aiu_fms_testing_utils/testing/validation.py index 072c9c0a..ad1b5906 100644 --- a/aiu_fms_testing_utils/testing/validation.py +++ b/aiu_fms_testing_utils/testing/validation.py @@ -472,7 +472,6 @@ def find_validation_info_path( """ Find the validation info path if it exists, otherwise return None """ - enforce_sizes = kwargs.get("enforce_sizes", None) sample_key = kwargs.get("sample_key", None) if aftu_version is None: @@ -494,7 +493,6 @@ def find_validation_info_path( loc_version_tuple, device_type, dtype, - enforce_sizes=enforce_sizes, sample_key=sample_key, ) # if the path is found, we are done searching and can return From aef361ea630348c6ece5e686e9238944239e86d3 Mon Sep 17 00:00:00 2001 From: kcirred <16872435+kcirred@users.noreply.github.com> Date: Tue, 7 Oct 2025 01:18:06 +0000 Subject: [PATCH 08/22] [dpp] added handling of return_key for __custom_line_sampler Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com> --- scripts/drive_paged_programs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/drive_paged_programs.py b/scripts/drive_paged_programs.py index e3cb7ca6..033a8efe 100644 --- a/scripts/drive_paged_programs.py +++ b/scripts/drive_paged_programs.py @@ -40,6 +40,7 @@ get_programs_prompts, KVCACHE_NUM_BLOCKS_HINT, ) +from aiu_fms_testing_utils.testing.utils import format_kwargs_to_string parser = argparse.ArgumentParser( description="Script which will drive paged programs for debugging" @@ -195,6 +196,10 @@ custom_shape = (len(result), max([_[1] for _ in result])) def __custom_line_sampler(*args, **kwargs): + return_key = kwargs.get("return_key", False) + 
sample_key = format_kwargs_to_string(**kwargs) + if return_key: + return result, sample_key return result sampler = __custom_line_sampler From 178b75b1886d71fa17a9fc17217a358f151ef8b9 Mon Sep 17 00:00:00 2001 From: Joshua Rosenkranz Date: Tue, 7 Oct 2025 14:45:11 +0000 Subject: [PATCH 09/22] updated llama model expectation tests using v1.0.0 aiu software stack as modeling code changed Signed-off-by: Joshua Rosenkranz --- ...TestAIUDecoderModels.Llama-3.1-8B-Instruct.test_model_output | 2 +- ...IUDecoderModels.Llama-3.1-8B-Instruct.test_model_weight_keys | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/resources/expectations/models.test_model_expectations.TestAIUDecoderModels.Llama-3.1-8B-Instruct.test_model_output b/tests/resources/expectations/models.test_model_expectations.TestAIUDecoderModels.Llama-3.1-8B-Instruct.test_model_output index 098709e3..bfbcd6b1 100644 --- a/tests/resources/expectations/models.test_model_expectations.TestAIUDecoderModels.Llama-3.1-8B-Instruct.test_model_output +++ b/tests/resources/expectations/models.test_model_expectations.TestAIUDecoderModels.Llama-3.1-8B-Instruct.test_model_output @@ -1 +1 @@ -9.625,9.625,9.6875,9.625,10.53125,37.375,8.65625,14.90625,1.03125,5.875,15.6875,6.0625,9.5,17.5625,37.0,10.34375,6.25,13.125,3.8125,9.21875,21.96875,14.28125,0.0,13.09375,7.6875,6.4375,19.09375,10.6875,23.9375,13.0,11.84375,46.4375,6.59375,0.0,13.0,23.125,16.34375,3.125,12.65625,6.03125,14.375,6.84375,14.9375,20.9375,5.625,37.0,4.875,3.25,7.40625,2.6875,18.9375,4.1875,13.5,8.4375,21.1875,13.21875,35.25,21.78125,8.3125,4.75,12.0625,3.90625,9.34375,4.25 \ No newline at end of file +0.18359375,0.18359375,0.181640625,0.189453125,0.2734375,0.544921875,0.607421875,0.365234375,0.30078125,0.25,0.078125,0.302734375,0.0,0.322265625,0.142578125,0.099609375,0.296875,0.28125,0.673828125,0.44921875,0.13671875,0.42578125,1.072265625,0.18359375,0.388671875,0.177734375,0.193359375,0.296875,0.484375,0.3515625,0.826171875,0.349609375,0.296875,0.720703125,0.634765625,0.607421875,0.14453125,0.29296875,0.154296875,0.287109375,0.482421875,0.2421875,0.48046875,0.203125,0.349609375,0.21484375,0.28515625,0.17578125,0.162109375,0.3203125,0.3125,0.54296875,0.287109375,0.361328125,0.390625,0.08984375,0.2109375,0.5,0.18359375,0.228515625,0.314453125,0.291015625,0.248046875,0.5078125 \ No newline at end of file diff --git a/tests/resources/expectations/models.test_model_expectations.TestAIUDecoderModels.Llama-3.1-8B-Instruct.test_model_weight_keys b/tests/resources/expectations/models.test_model_expectations.TestAIUDecoderModels.Llama-3.1-8B-Instruct.test_model_weight_keys index 6329cb98..3fcc470f 100644 --- a/tests/resources/expectations/models.test_model_expectations.TestAIUDecoderModels.Llama-3.1-8B-Instruct.test_model_weight_keys +++ b/tests/resources/expectations/models.test_model_expectations.TestAIUDecoderModels.Llama-3.1-8B-Instruct.test_model_weight_keys @@ -1 +1 @@ 
-dec_norm.weight,layers.0.attn.dense.weight,layers.0.attn.in_proj.key.weight,layers.0.attn.in_proj.query.weight,layers.0.attn.in_proj.value.weight,layers.0.ff_ln.weight,layers.0.ff_sub_layer.w1.weight,layers.0.ff_sub_layer.w2.weight,layers.0.ff_sub_layer.wg.weight,layers.0.ln.weight,layers.1.attn.dense.weight,layers.1.attn.in_proj.key.weight,layers.1.attn.in_proj.query.weight,layers.1.attn.in_proj.value.weight,layers.1.ff_ln.weight,layers.1.ff_sub_layer.w1.weight,layers.1.ff_sub_layer.w2.weight,layers.1.ff_sub_layer.wg.weight,layers.1.ln.weight,layers.2.attn.dense.weight,layers.2.attn.in_proj.key.weight,layers.2.attn.in_proj.query.weight,layers.2.attn.in_proj.value.weight,layers.2.ff_ln.weight,layers.2.ff_sub_layer.w1.weight,layers.2.ff_sub_layer.w2.weight,layers.2.ff_sub_layer.wg.weight,layers.2.ln.weight,shared.emb.weight,shared.head.weight \ No newline at end of file +base_model.dec_norm.weight,base_model.embedding.weight,base_model.layers.0.attn.dense.weight,base_model.layers.0.attn.in_proj.key.weight,base_model.layers.0.attn.in_proj.query.weight,base_model.layers.0.attn.in_proj.value.weight,base_model.layers.0.ff_ln.weight,base_model.layers.0.ff_sub_layer.w1.weight,base_model.layers.0.ff_sub_layer.w2.weight,base_model.layers.0.ff_sub_layer.wg.weight,base_model.layers.0.ln.weight,base_model.layers.1.attn.dense.weight,base_model.layers.1.attn.in_proj.key.weight,base_model.layers.1.attn.in_proj.query.weight,base_model.layers.1.attn.in_proj.value.weight,base_model.layers.1.ff_ln.weight,base_model.layers.1.ff_sub_layer.w1.weight,base_model.layers.1.ff_sub_layer.w2.weight,base_model.layers.1.ff_sub_layer.wg.weight,base_model.layers.1.ln.weight,base_model.layers.2.attn.dense.weight,base_model.layers.2.attn.in_proj.key.weight,base_model.layers.2.attn.in_proj.query.weight,base_model.layers.2.attn.in_proj.value.weight,base_model.layers.2.ff_ln.weight,base_model.layers.2.ff_sub_layer.w1.weight,base_model.layers.2.ff_sub_layer.w2.weight,base_model.layers.2.ff_sub_layer.wg.weight,base_model.layers.2.ln.weight,head.weight \ No newline at end of file From 73f6551d01e1562530798ac025c34ca2fab45dab Mon Sep 17 00:00:00 2001 From: kcirred <16872435+kcirred@users.noreply.github.com> Date: Tue, 30 Sep 2025 15:07:14 +0000 Subject: [PATCH 10/22] [testing] changed get_default_validation_prefix to generic kwargs, file names now sorted, testing of file names modified for new order Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com> --- aiu_fms_testing_utils/testing/utils.py | 4 +++- aiu_fms_testing_utils/testing/validation.py | 17 +++++------------ scripts/generate_layers_metrics.py | 6 +++++- scripts/generate_metrics.py | 10 +++++----- tests/testing/test_validation.py | 10 +++++----- 5 files changed, 23 insertions(+), 24 deletions(-) diff --git a/aiu_fms_testing_utils/testing/utils.py b/aiu_fms_testing_utils/testing/utils.py index 72fd30b2..cacff899 100644 --- a/aiu_fms_testing_utils/testing/utils.py +++ b/aiu_fms_testing_utils/testing/utils.py @@ -17,6 +17,8 @@ def format_kwargs_to_string(**kwargs): # only append if formatted_value exists if formatted_value: # Keep previous convention of variable names with `-` instead of `_` - formatted_pairs.append(f"{key.replace('_', '-')}-{formatted_value}") + formatted_pairs.append( + f"{key.replace('_', '-')}-{formatted_value.replace('/', '--')}" + ) return "_".join(formatted_pairs) diff --git a/aiu_fms_testing_utils/testing/validation.py b/aiu_fms_testing_utils/testing/validation.py index ad1b5906..5bf120a0 100644 --- 
a/aiu_fms_testing_utils/testing/validation.py +++ b/aiu_fms_testing_utils/testing/validation.py @@ -128,13 +128,6 @@ def __len__(self): def get_default_validation_prefix( - model_id: str, - max_new_tokens: int, - batch_size: int, - seq_length: int, - dtype: str, - attn_type: str, - aftu_version: str, **kwargs, ): """ @@ -150,12 +143,12 @@ def get_default_validation_prefix( Returns: str: A hashed prefix that will be prepended to the file name """ + aftu_version = kwargs.pop( + "aftu_version", ".".join([str(_) for _ in version_tuple[:3]]) + ) kwargs_str = format_kwargs_to_string(**kwargs) - if kwargs_str == "": - filename = f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}" - else: - filename = f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}_{kwargs_str}" + filename = f"{kwargs_str}" hash_object = hashlib.sha256(filename.encode("utf-8")) hex_digest = hash_object.hexdigest() return f"{hex_digest}_{aftu_version}" @@ -435,7 +428,7 @@ def get_validation_info_path( sample_key = kwargs.get("sample_key", None) - validation_file_name = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]), sample_key=sample_key)}.{device_type}_validation_info.{seed}.out" + validation_file_name = f"{get_default_validation_prefix(aftu_version='.'.join([str(_) for _ in aftu_version[:3]]), model_id=model_variant, max_new_tokens=max_new_tokens, batch_size=batch_size, seq_length=seq_length, dtype=dtype, attn_type=attn_type, sample_key=sample_key)}.{device_type}_validation_info.{seed}.out" full_path = os.path.join(validation_info_dir, validation_file_name) return full_path diff --git a/scripts/generate_layers_metrics.py b/scripts/generate_layers_metrics.py index d3245123..ffc01930 100644 --- a/scripts/generate_layers_metrics.py +++ b/scripts/generate_layers_metrics.py @@ -473,7 +473,11 @@ def generate_layers_metrics(model_path, batch_size, seq_length, max_new_tokens): cos_sim = tensor_cos_sim(tensor_cpu_out, cuda_output) prefix = get_default_validation_prefix( - model_path, max_new_token, batch_size, seq_length, "float16" + model_id=model_path, + max_new_tokens=max_new_token, + batch_size=batch_size, + seq_length=seq_length, + dtype="float16", ) layer_name = str(layer_key).replace("[", "").replace("]", "") diff --git a/scripts/generate_metrics.py b/scripts/generate_metrics.py index 8ec3f028..f65149fa 100644 --- a/scripts/generate_metrics.py +++ b/scripts/generate_metrics.py @@ -134,11 +134,11 @@ # this follows the same pattern of naming in test_shapes. This way we can save and re-use for quicker shape testing. 
prefix = get_default_validation_prefix( - args.variant, - args.max_new_tokens, - args.batch_size, - args.min_pad_length, - args.default_dtype, + model_id=args.variant, + max_new_tokens=args.max_new_tokens, + batch_size=args.batch_size, + seq_len=args.min_pad_length, + dtype=args.default_dtype, ) if os.path.exists(os.path.join(args.output_dir, f"{prefix}.prob_mean.csv")): print("skipping metric generation as it has already been done") diff --git a/tests/testing/test_validation.py b/tests/testing/test_validation.py index 220f89e9..95f2ff4e 100644 --- a/tests/testing/test_validation.py +++ b/tests/testing/test_validation.py @@ -80,7 +80,7 @@ def test_validation_info_round_trip(validation_type, post_iteration_hook): def test_get_validation_info_path(tmp_path): - check_pathname = "ibm-granite--granite-3.3-8b-instruct_max-new-tokens-128_batch-size-4_seq-length-64_dtype-fp16_attn-type-sdpa" + check_pathname = "attn-type-sdpa_batch-size-4_dtype-fp16_max-new-tokens-128_model-id-ibm-granite--granite-3.3-8b-instruct_seq-length-64" hash_object = hashlib.sha256(check_pathname.encode("utf-8")) hex_digest = hash_object.hexdigest() @@ -91,7 +91,7 @@ def test_get_validation_info_path(tmp_path): == f"{tmp_path}/{hex_digest}_{'.'.join([str(_) for _ in version_tuple[:3]])}.cpu_validation_info.0.out" ) - check_pathname = "ibm-granite--granite-3.3-8b-instruct_max-new-tokens-128_batch-size-4_seq-length-64_dtype-fp16_attn-type-sdpa" + check_pathname = "attn-type-sdpa_batch-size-4_dtype-fp16_max-new-tokens-128_model-id-ibm-granite--granite-3.3-8b-instruct_seq-length-64" hash_object = hashlib.sha256(check_pathname.encode("utf-8")) hex_digest = hash_object.hexdigest() @@ -295,15 +295,15 @@ def test_get_default_validation_prefix( sample_key = None # get_default_validation_prefix with sample_key set to None - check_prefix_sample_key_none = f"{model_variant}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}_attn-type-{attn_type}" + check_prefix_sample_key_none = f"attn-type-{attn_type}_batch-size-{batch_size}_dtype-{dtype}_max-new-tokens-{max_new_tokens}_model-id-{model_variant}_seq-length-{seq_length}" hash_object = hashlib.sha256(check_prefix_sample_key_none.encode("utf-8")) hex_digest = hash_object.hexdigest() - prefix_sample_key_none = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]), sample_key=sample_key)}.{device_type}_validation_info.{seed}.out" + prefix_sample_key_none = f"{get_default_validation_prefix(model_id=model_variant, max_new_tokens=max_new_tokens, batch_size=batch_size, seq_length=seq_length, dtype=dtype, attn_type=attn_type, aftu_version='.'.join([str(_) for _ in aftu_version[:3]]), sample_key=sample_key)}.{device_type}_validation_info.{seed}.out" assert prefix_sample_key_none == f"{hex_digest}_1.2.3.cpu_validation_info.0.out" # get_default_validation_prefix with no kwargs using legacy case - legacy_prefix = f"{get_default_validation_prefix(model_variant, max_new_tokens, batch_size, seq_length, dtype, attn_type, '.'.join([str(_) for _ in aftu_version[:3]]))}.{device_type}_validation_info.{seed}.out" + legacy_prefix = f"{get_default_validation_prefix(model_id=model_variant, max_new_tokens=max_new_tokens, batch_size=batch_size, seq_length=seq_length, dtype=dtype, attn_type=attn_type, aftu_version='.'.join([str(_) for _ in aftu_version[:3]]))}.{device_type}_validation_info.{seed}.out" assert prefix_sample_key_none == legacy_prefix # retrieve a sample_key with 
return_key is True From 86d60d677d0f05422e5a5f3f5e12845db79e68cf Mon Sep 17 00:00:00 2001 From: Joshua Rosenkranz Date: Wed, 8 Oct 2025 03:28:29 +0000 Subject: [PATCH 11/22] fixed test_scripts program assertion Signed-off-by: Joshua Rosenkranz --- tests/models/test_scripts.py | 45 ++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/tests/models/test_scripts.py b/tests/models/test_scripts.py index 79bd9952..21b47fb4 100644 --- a/tests/models/test_scripts.py +++ b/tests/models/test_scripts.py @@ -6,6 +6,7 @@ from pathlib import Path import itertools import math +from aiu_fms_testing_utils.utils.paged import get_programs_prompts, ProgramCriteria FMS_DIR = Path(__file__).parent AIU_FMS_DIR = os.path.join(FMS_DIR, "../../../aiu-fms-testing-utils/") @@ -291,28 +292,48 @@ def test_dpp_script( ) print(result_text) with open(os.environ["DT_PROG_CRITERIA_FILEPATH"], "r") as f: - program_criteria_list = json.load(f)["programs"] + program_criteria_json_list = json.load(f)["programs"] + program_criteria_list = [] + for i, d in enumerate(program_criteria_json_list): + program_criteria_list.append( + ProgramCriteria( + i, + d["max_batch"], + d["max_tkv"], + d["batch_granularity"], + d["tkv_granularity"], + ) + ) if programs is None: program_assertions = [i for i in range(len(program_criteria_list))] shape_assertions = [">=0", ">=0"] else: + program_map = get_programs_prompts( + program_criteria_list, + multiple=64, + max_batch_size=2, + max_tkv=512, + program_cycles=max_new_tokens, + ) programs_split = programs.split(":") program_ids_str = programs_split[0] shape_assertions = [ f">={_}" if _.isnumeric() else _ for _ in programs_split[1].split(",") ] - match_number = r"\d+" - valid_program_assertions = [ - f">={re.search(match_number, _).group()}" for _ in shape_assertions - ] - # need to add 1 for tkv as that is the first decode - program_assertions = [ - i - for i, p in enumerate(program_criteria_list) - if eval(f"p['max_batch']{valid_program_assertions[0]}") - and eval(f"p['max_tkv']{valid_program_assertions[1]}+1") - ] + + program_assertions = [] + for program_id_seq, shapes in program_map.items(): + if any( + ( + eval( + f"shape[0]{shape_assertions[0]} and shape[1]{shape_assertions[1]}" + ) + for shape in shapes + ) + ): + program_assertions.append(program_id_seq[0].program_id) + if program_ids_str == "?": program_assertions = program_assertions[:1] elif program_ids_str.isnumeric(): From f3dcee101b8f90396d06b7fcbfe9fed9c15c5f23 Mon Sep 17 00:00:00 2001 From: Avery Blanchard Date: Tue, 8 Apr 2025 18:08:44 +0000 Subject: [PATCH 12/22] Add test case for caching Signed-off-by: Avery Blanchard Signed-off-by: Alex-Brooks --- tests/models/test_decoders.py | 59 +++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index d257c44c..3d6c9d94 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -188,6 +188,8 @@ os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(max(common_batch_sizes), 2)) fx_config.backed_size_oblivious = True +cache_params = list(itertools.product([common_model_paths[0]], [common_batch_sizes[0]], [common_seq_lengths[0]], [common_max_new_tokens[0]], ["miss", "hit"])) + # thresholds are chosen based on 1024 tokens per sequence # 1% error threshold rate between cpu fp32 and cuda fp16 # if a models failure thresholds do not exist in this dict, default to the default_metrics_threshold defined above @@ -259,7 +261,7 @@ def 
reset_compiler(): torch.compiler.reset() torch._dynamo.reset() os.environ.pop("COMPILATION_MODE", None) - + os.environ.pop('TORCH_SENDNN_CACHE_ENABLE', None) # TODO: Currently, gptq does not have the same level of support as non-gptq models for get_model. This method provides the extra requirements for gptq for get_model, # however ideally, these fixes should be done in foundation-model-stack. @@ -308,7 +310,6 @@ def __maybe_get_gptq_kwargs(model_path): pass return gptq_kwargs_aiu, gptq_kwargs_cpu - def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0): if "paged" in ATTN_NAME: prompts_and_sizes = sample_sharegpt_requests( @@ -774,3 +775,57 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): print("passed validation level 1") else: print("passed validation level 0") + +@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens,cache_status", cache_params) +def test_cache(model_path, batch_size, seq_length, max_new_tokens, cache_status): + torch.manual_seed(42) + os.environ["TORCH_SENDNN_CACHE_ENABLE"] = "1" + os.environ["COMPILATION_MODE"] = "offline_decoder" + + dprint(f"testing with cache: model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}, cache={cache_status}") + + if USE_MICRO_MODELS: + micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3} + else: + micro_model_kwargs = {"architecture": "hf_pretrained"} + + if not USE_MICRO_MODELS and os.path.exists(model_path): + model_path_kwargs = {"model_path": model_path} + else: + model_path_kwargs = {"variant": model_path} + + distributed_kwargs = {} + if USE_DISTRIBUTED: + distributed_kwargs["distr_param"] = "tp" + distributed_kwargs["group"] = dist.group.WORLD + get_model_kwargs = {**model_path_kwargs, **micro_model_kwargs, **distributed_kwargs} + + tokenizer = tokenizers.get_tokenizer(model_path) + + # prepare the AIU model + model = get_model( + device_type="cpu", + fused_weights=False, + **get_model_kwargs + ) + + model.eval() + torch.set_grad_enabled(False) + model.compile(backend="sendnn_decoder") + + + # prepare input_ids + input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) + + # warmup aiu model + warmup_model(model, input_ids, max_new_tokens, **padding_kwargs) + + # aiu validatation + aiu_validation_info = extract_validation_information( + model, + input_ids, + max_new_tokens, + None, + only_last_token=True, + **padding_kwargs +) From f936eb594fb95a26afd138c6e13f97a8a71cea7b Mon Sep 17 00:00:00 2001 From: Avery Blanchard Date: Sat, 19 Jul 2025 00:18:38 +0000 Subject: [PATCH 13/22] Update cache test, add validation for cached run Signed-off-by: Alex-Brooks --- tests/models/test_decoders.py | 265 ++++++++++++++++++++++++++++++---- 1 file changed, 240 insertions(+), 25 deletions(-) diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index 3d6c9d94..adbeae50 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -29,7 +29,7 @@ from transformers import AutoTokenizer from aiu_fms_testing_utils.utils.aiu_setup import dprint, aiu_dist_setup - +import shutil import os try: @@ -188,7 +188,6 @@ os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(max(common_batch_sizes), 2)) fx_config.backed_size_oblivious = True -cache_params = list(itertools.product([common_model_paths[0]], [common_batch_sizes[0]], [common_seq_lengths[0]], [common_max_new_tokens[0]], ["miss", "hit"])) # thresholds are chosen based on 1024 tokens per sequence # 1% error 
threshold rate between cpu fp32 and cuda fp16 @@ -776,56 +775,272 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): else: print("passed validation level 0") -@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens,cache_status", cache_params) -def test_cache(model_path, batch_size, seq_length, max_new_tokens, cache_status): +@pytest.mark.parametrize("cache_status", ["miss", "hit"]) +def test_cache(cache_status): torch.manual_seed(42) + torch.set_grad_enabled(False) os.environ["TORCH_SENDNN_CACHE_ENABLE"] = "1" + os.environ["TORCH_SENDNN_CACHE_DIR"] = os.getcwd()+"/.cache" os.environ["COMPILATION_MODE"] = "offline_decoder" + if cache_status == "miss" and os.path.isdir(os.getcwd()+"/.cache"): + # Remove cache from previous runs + shutil.rmtree(os.getcwd()+"/.cache") + + model_path = "ibm-granite/granite-3.3-8b-instruct" + batch_size = common_batch_sizes[0] + seq_length = common_seq_lengths[0] + max_new_tokens = common_max_new_tokens[0] + dprint(f"testing with cache: model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}, cache={cache_status}") - if USE_MICRO_MODELS: + # we don't currently support inferring gptq from get_model, so we must use an adapter with hf_configured + gptq_kwargs_aiu, gptq_kwargs_cpu = __maybe_get_gptq_kwargs(model_path) + is_gptq = len(gptq_kwargs_aiu) != 0 + + micro_model_path = micro_model_mapping.get(model_path, None) + if USE_MICRO_MODELS and micro_model_path is None: + dprint("using randomly initialized model") micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3} else: - micro_model_kwargs = {"architecture": "hf_pretrained"} - + dprint("using trained model") + micro_model_kwargs = {"architecture": "hf_pretrained"} + if not USE_MICRO_MODELS and os.path.exists(model_path): model_path_kwargs = {"model_path": model_path} + elif USE_MICRO_MODELS and micro_model_path is not None: + model_path_kwargs = {"model_path": micro_model_path} else: model_path_kwargs = {"variant": model_path} - + distributed_kwargs = {} if USE_DISTRIBUTED: - distributed_kwargs["distr_param"] = "tp" + distributed_kwargs["distributed_strategy"] = "tp" distributed_kwargs["group"] = dist.group.WORLD - get_model_kwargs = {**model_path_kwargs, **micro_model_kwargs, **distributed_kwargs} + + get_model_kwargs = {} + if not is_gptq: + get_model_kwargs = { + **model_path_kwargs, + **micro_model_kwargs, + **distributed_kwargs, + } tokenizer = tokenizers.get_tokenizer(model_path) # prepare the AIU model model = get_model( + device_type="cpu", + data_type=None if is_gptq else torch.float16, + fused_weights=False, + **get_model_kwargs, + ) + + model.eval() + model.compile(backend="sendnn") + + # prepare the cpu model + validation_model = get_model( device_type="cpu", + data_type=None if is_gptq else torch.float32, fused_weights=False, - **get_model_kwargs + **gptq_kwargs_cpu, + **get_model_kwargs, ) - model.eval() - torch.set_grad_enabled(False) - model.compile(backend="sendnn_decoder") - + if USE_MICRO_MODELS: + serialization.load_state_dict_into_model( + validation_model, model.state_dict(), **__custom_adapter + ) # prepare input_ids - input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) + input_ids, extra_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) + extra_kwargs["attn_name"] = ATTN_NAME # warmup aiu model - warmup_model(model, input_ids, max_new_tokens, **padding_kwargs) + warmup_model(model, input_ids, max_new_tokens, 
compile_dynamic_sendnn, **extra_kwargs) + + # generate cpu validation info + cpu_validation_info = __load_validation_info( + model_path, batch_size, seq_length, max_new_tokens, tokenizer, 0 + ) + if cpu_validation_info is None: + cpu_validation_info = extract_validation_information( + validation_model, + input_ids, + max_new_tokens, + LogitsExtractorHook(), + attn_algorithm="math", + **extra_kwargs, + ) - # aiu validatation + if save_validation_info_outputs: + cpu_validation_info.save( + __get_validation_info_full_path( + model_path, batch_size, seq_length, max_new_tokens, 0 + ) + ) + cpu_static_tokens = cpu_validation_info.get_info("tokens") + eos_indexes = __find_eos_index( + cpu_static_tokens, tokenizer.eos_token_id, seq_length, max_new_tokens + ) + dprint( + "cpu validation info extracted for validation level 0 and validation level 1 (iter=0)" + ) + + # first test validation level 0 aiu_validation_info = extract_validation_information( - model, - input_ids, - max_new_tokens, - None, - only_last_token=True, - **padding_kwargs -) + model, input_ids, max_new_tokens, None, only_last_token="paged" not in ATTN_NAME, **extra_kwargs + ) + dprint("aiu validation info extracted for validation level 0") + + # check cache status before validating cached results + updated_cache_len = len(os.listdir(os.getcwd()+"/.cache")) if os.path.isdir(os.getcwd()+"/.cache") else 0 + if cache_status == "miss": + assert updated_cache_len == max_new_tokens, ( + "cache directory not populated on cache miss" + ) + return + else: + assert updated_cache_len == max_new_tokens, ( + "cache miss occurred when hit was expected" + ) + + # validate level 0 + failed_responses = validate_level_0( + aiu_validation_info.get_info("tokens"), cpu_static_tokens + ) + + failed_validation_level_0 = len(failed_responses) != 0 + + # if level 0 fails validation, validate level 1 + if FORCE_VALIDATION_LEVEL_1 or failed_validation_level_0: + + if failed_validation_level_0: + dprint("failed validation level 0, testing validation level 1") + else: + dprint("passed validation level 0, testing validation level 1") + + # metric calculator based on the cross-entropy and mean diff for each decode step + def _metric_calculator(r: torch.Tensor, t: torch.Tensor): + cross_entropy = torch.nn.CrossEntropyLoss()( + r, t.softmax(dim=1).to(dtype=torch.float32) + ) + diff = torch.mean( + torch.abs( + r.softmax(dim=1).to(dtype=torch.float32) + - t.softmax(dim=1).to(dtype=torch.float32) + ) + ) + return (cross_entropy, diff) + + iters = 1024 // max_new_tokens + ce_fail_responses_list = [] + diff_fail_responses_list = [] + total_tokens = 0 + for i in range(iters): + # for iteration 0, we have computed the cpu validation info in the prior step for seed=0, so skip + if i != 0: + input_ids, extra_kwargs = __prepare_inputs( + batch_size, seq_length, tokenizer, seed=i + ) + extra_kwargs["attn_name"] = ATTN_NAME + cpu_validation_info = __load_validation_info( + model_path, batch_size, seq_length, max_new_tokens, tokenizer, i + ) + if cpu_validation_info is None: + cpu_validation_info = extract_validation_information( + validation_model, + input_ids, + max_new_tokens, + LogitsExtractorHook(), + attn_algorithm="math", + **extra_kwargs, + ) + dprint( + f"cpu validation info extracted for validation level 1 - iter={i}" + ) + if save_validation_info_outputs: + cpu_validation_info.save( + __get_validation_info_full_path( + model_path, batch_size, seq_length, max_new_tokens, i + ) + ) + cpu_static_tokens = cpu_validation_info.get_info("tokens") + eos_indexes = 
__find_eos_index( + cpu_static_tokens, + tokenizer.eos_token_id, + seq_length, + max_new_tokens, + ) + + # generate aiu validation info + aiu_validation_info = extract_validation_information( + model, + input_ids, + max_new_tokens, + GoldenTokenHook(cpu_static_tokens), + only_last_token=ATTN_TYPE != "paged", + **extra_kwargs, + ) + dprint(f"aiu validation info extracted for validation level 1 - iter={i}") + if save_validation_info_outputs: + aiu_validation_info.save( + __get_validation_info_full_path( + model_path, batch_size, seq_length, max_new_tokens, i, "aiu" + ) + ) + + # capture all level 1 metrics + level_1_metrics = capture_level_1_metrics( + cpu_validation_info.get_info("logits"), + aiu_validation_info.get_info("logits"), + top_k_loss_calculator(20, _metric_calculator), + ) + # only consider those metrics captured prior to the eos + level_1_metrics = __filter_before_eos(level_1_metrics, eos_indexes) + + # if we do not have real model weights, use a default_metrics_threshold + if USE_MICRO_MODELS and micro_model_path is None: + ce_threshold, diff_threshold = default_metrics_threshold + # if we have real weights, try and get the proper validation metrics threshold + else: + # if we have a micro model with real weights, but no real thresholds, default to the full model thresholds + if USE_MICRO_MODELS: + ce_threshold, diff_threshold = fail_thresholds.get( + (model_path, True), fail_thresholds.get((model_path, False), default_metrics_threshold) + ) + else: + ce_threshold, diff_threshold = fail_thresholds.get( + (model_path, False), default_metrics_threshold + ) + + # get all failed responses for each metric + ce_fail_responses = filter_failed_level_1_cases( + level_1_metrics, lambda m: m[0] >= ce_threshold + ) + diff_fail_responses = filter_failed_level_1_cases( + level_1_metrics, + lambda m: m[1] >= diff_threshold, + ) + + ce_fail_responses_list.extend(ce_fail_responses) + diff_fail_responses_list.extend(diff_fail_responses) + total_tokens += len(level_1_metrics) + + # test the failure rates for across all tokens + diff_failure_rate = len(diff_fail_responses_list) / total_tokens + ce_failure_rate = len(ce_fail_responses_list) / total_tokens + dprint(f"mean diff failure rate: {diff_failure_rate}") + dprint(f"cross entropy loss failure rate: {ce_failure_rate}") + if "mean_diff" not in skip_assertions: + assert diff_failure_rate < failure_rate_threshold, ( + f"failure rate for mean diff was too high: {diff_failure_rate}" + ) + if "ce" not in skip_assertions: + assert ce_failure_rate < failure_rate_threshold, ( + f"failure rate for cross entropy loss was too high: {ce_failure_rate}" + ) + print("passed validation level 1") + else: + print("passed validation level 0") From 81870d51c317a8fddd16fab9d5f58235f82fe4c8 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 30 Jul 2025 05:32:12 -0600 Subject: [PATCH 14/22] Squashed commit of the following: commit ed571f728a351f8dd92737be5554c3dc46f71a30 Author: Alex-Brooks Date: Tue Jul 29 09:20:06 2025 -0600 Remove cache tests commit 2848f7b2785b91c60c536b8993c3193c40c381ea Author: Alex-Brooks Date: Mon Jul 28 08:07:01 2025 -0600 Add leading underscores, revert model name commit c30b7b70a0f6e464d3212fd9bed4f9ea33f9de93 Author: Alex-Brooks Date: Mon Jul 28 07:15:08 2025 -0600 Explictly clear cache paths commit 42aaf666d7f8ffb2fb611df7ad2d06b48e480dd7 Author: Alex-Brooks Date: Mon Jul 28 07:14:23 2025 -0600 Set the cache dir in conftest commit b978e7225f02bf1d9a5f7b919ca6cbe2ee8d641a Author: Alex-Brooks Date: Mon Jul 28 06:15:10 2025 -0600 run 
formatting commit 8d64df08333991927c45f9a982ddaf95f39c94cf Author: Alex-Brooks Date: Fri Jul 25 11:18:13 2025 -0600 refactor cache miss into fixture commit 0b524b8c818495cb646add2adfc27a2884ac8de5 Author: Alex-Brooks Date: Fri Jul 25 07:11:09 2025 -0600 Consolidate cache test with common commit d8a36d405a101e101ab9ede3b8d12fa3026cd01f Author: Alex-Brooks Date: Fri Jul 25 06:41:13 2025 -0600 Run cache test first commit 2efb797fb21587e9136b314c44ec56c658636826 Author: Alex-Brooks Date: Fri Jul 25 05:48:25 2025 -0600 Finish splitting out common shape test helpers commit 4ae73dea18848005f86d1c9bcdf29f153711330f Author: Alex-Brooks Date: Fri Jul 25 05:28:31 2025 -0600 refactor most of common shape test commit 083afdc3a468649ec4b0bbadc921d40b47e37498 Author: Alex-Brooks Date: Thu Jul 24 14:08:20 2025 -0600 Move torch sendnn cache dir to common commit e9b576381a738c59f91d5fc904ceaa2a0e410864 Author: Alex-Brooks Date: Thu Jul 24 14:02:06 2025 -0600 Use caps for constants, common post proc Signed-off-by: Alex-Brooks --- tests/models/conftest.py | 4 + tests/models/test_decoders.py | 997 ++++++++++++++++------------------ 2 files changed, 458 insertions(+), 543 deletions(-) diff --git a/tests/models/conftest.py b/tests/models/conftest.py index 7e716618..29fac434 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -23,6 +23,10 @@ def pytest_sessionstart(session): os.environ.setdefault("DTLOG_LEVEL", "error") os.environ.setdefault("DT_DEEPRT_VERBOSE", "-1") + # NOTE: we should configure the cachedir before importing torchsendnn's + # graph cache to prevent it from being initialized in the wrong place. + os.environ["TORCH_SENDNN_CACHE_DIR"] = os.path.join(os.getcwd(), ".cache") + def pytest_addoption(parser): parser.addoption( diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index adbeae50..a4a3e247 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -29,7 +29,6 @@ from transformers import AutoTokenizer from aiu_fms_testing_utils.utils.aiu_setup import dprint, aiu_dist_setup -import shutil import os try: @@ -50,7 +49,7 @@ GRANITE_20B_CODE_INSTRUCT_8K = "ibm-granite/granite-20b-code-instruct-8k" LLAMA_3p1_70B_INSTRUCT = "meta-llama/Llama-3.1-70B-Instruct" -micro_model_mapping = { +MICRO_MODEL_MAPPING = { LLAMA_3p1_8B_INSTRUCT: os.path.join( MICRO_MODELS_HOME, "llama-3.1-8b-layers-3-step-24000" ), @@ -76,24 +75,24 @@ os.environ.get("FMS_TEST_SHAPES_CUMULATIVE_TEST_TOKENS_PER_SEQUENCE", "1024") ) ATTN_TYPE = os.environ.get("FMS_TEST_SHAPES_ATTN_TYPE", "sdpa") -attention_map = { +ATTENTION_MAP = { "sdpa": "sdpa_causal", "paged": "spyre_paged_attn", "math_fp8": "math_fp8", "paged_fp8": "spyre_paged_attn_fp8", } -ATTN_NAME = attention_map[ATTN_TYPE] +ATTN_NAME = ATTENTION_MAP[ATTN_TYPE] CPU_DTYPE = "fp8" if "fp8" in ATTN_TYPE else "fp32" FORCE_VALIDATION_LEVEL_1 = ( os.environ.get("FMS_TEST_SHAPES_FORCE_VALIDATION_LEVEL_1", "0") == "1" ) -skip_assertions = os.environ.get("FMS_TEST_SHAPES_SKIP_ASSERTIONS", {}) -validation_info_dir = os.environ.get( +SKIP_ASSERTIONS = os.environ.get("FMS_TEST_SHAPES_SKIP_ASSERTIONS", {}) +VALIDATION_INFO_DIR = os.environ.get( "FMS_TEST_SHAPES_VALIDATION_INFO_DIR", "/tmp/models/validation_info" ) -common_model_paths = os.environ.get( +COMMON_MODEL_PATHS = os.environ.get( "FMS_TEST_SHAPES_COMMON_MODEL_PATHS", [ LLAMA_3p1_8B_INSTRUCT, @@ -103,25 +102,25 @@ LLAMA_3p1_70B_INSTRUCT, ], ) -model_configuration_path = os.environ.get( +MODEL_CONFIGURATION_PATH = os.environ.get( 
"FMS_TEST_SHAPES_FROM_MODEL_CONFIGURATION", "" ) -model_configuration_frequency = os.environ.get( +MODEL_CONFIGURATION_FREQUENCY = os.environ.get( "FMS_TEST_SHAPES_FROM_MODEL_CONFIGURATION_FREQUENCY", "0" ) # for validation level 1, the default is a failure rate of 1% # set this environment variable if you would like to relax that threshold -failure_rate_threshold = os.environ.get("FMS_TEST_SHAPES_FAILURE_THRESHOLD", 0.01) -default_metrics_threshold = os.environ.get( +FAILURE_RATE_THRESHOLD = os.environ.get("FMS_TEST_SHAPES_FAILURE_THRESHOLD", 0.01) +DEFAULT_METRICS_THRESHOLD = os.environ.get( "FMS_TEST_SHAPES_METRICS_THRESHOLD", (3.0, 0.001) ) -save_validation_info_outputs = ( +SAVE_VALIDATION_INFO_OUTPUTS = ( os.environ.get("FMS_TEST_SHAPES_SAVE_VALIDATION_INFO_OUTPUTS", "0") == "1" ) -common_batch_sizes = os.environ.get("FMS_TEST_SHAPES_COMMON_BATCH_SIZES", [1, 2, 4, 8]) -common_seq_lengths = os.environ.get("FMS_TEST_SHAPES_COMMON_SEQ_LENGTHS", [64, 2048]) -common_max_new_tokens = os.environ.get("FMS_TEST_SHAPES_COMMON_MAX_NEW_TOKENS", [128]) +COMMON_BATCH_SIZES = os.environ.get("FMS_TEST_SHAPES_COMMON_BATCH_SIZES", [1, 2, 4, 8]) +COMMON_SEQ_LENGTHS = os.environ.get("FMS_TEST_SHAPES_COMMON_SEQ_LENGTHS", [64, 2048]) +COMMON_MAX_NEW_TOKENS = os.environ.get("FMS_TEST_SHAPES_COMMON_MAX_NEW_TOKENS", [128]) if USE_DISTRIBUTED: dist.init_process_group() @@ -131,69 +130,68 @@ ) if USE_MICRO_MODELS: - validation_info_dir = os.path.join(validation_info_dir, "tiny_models") + VALIDATION_INFO_DIR = os.path.join(VALIDATION_INFO_DIR, "tiny_models") # pass custom model path list for eg: EXPORT FMS_TESTING_COMMON_MODEL_PATHS="/tmp/models/granite-3-8b-base,/tmp/models/granite-7b-base" -if isinstance(common_model_paths, str): - common_model_paths = common_model_paths.split(",") +if isinstance(COMMON_MODEL_PATHS, str): + COMMON_MODEL_PATHS = COMMON_MODEL_PATHS.split(",") # pass custom failure rate threshold as float -if isinstance(failure_rate_threshold, str): - failure_rate_threshold = float(failure_rate_threshold) +if isinstance(FAILURE_RATE_THRESHOLD, str): + FAILURE_RATE_THRESHOLD = float(FAILURE_RATE_THRESHOLD) # pass custom default metrics threshold as a comma separated str of floats , -if isinstance(default_metrics_threshold, str): - default_metrics_threshold = tuple( - [float(m) for m in default_metrics_threshold.split(",")] +if isinstance(DEFAULT_METRICS_THRESHOLD, str): + DEFAULT_METRICS_THRESHOLD = tuple( + [float(m) for m in DEFAULT_METRICS_THRESHOLD.split(",")] ) # pass custom common batch sizes as a comma separated str of ints -if isinstance(common_batch_sizes, str): - common_batch_sizes = [int(bs) for bs in common_batch_sizes.split(",")] +if isinstance(COMMON_BATCH_SIZES, str): + COMMON_BATCH_SIZES = [int(bs) for bs in COMMON_BATCH_SIZES.split(",")] # pass custom common seq lengths as a comma separated str of ints -if isinstance(common_seq_lengths, str): - common_seq_lengths = [int(sl) for sl in common_seq_lengths.split(",")] +if isinstance(COMMON_SEQ_LENGTHS, str): + COMMON_SEQ_LENGTHS = [int(sl) for sl in COMMON_SEQ_LENGTHS.split(",")] # pass custom common max new tokens as a comma separated str of ints -if isinstance(common_max_new_tokens, str): - common_max_new_tokens = [int(mnt) for mnt in common_max_new_tokens.split(",")] +if isinstance(COMMON_MAX_NEW_TOKENS, str): + COMMON_MAX_NEW_TOKENS = [int(mnt) for mnt in COMMON_MAX_NEW_TOKENS.split(",")] # pass metrics to skip as a comma separated list (ce,mean_diff) -if isinstance(skip_assertions, str): +if isinstance(SKIP_ASSERTIONS, str): 
_skip_assertions = [] - for metric in skip_assertions.split(","): + for metric in SKIP_ASSERTIONS.split(","): metric = metric.lower() if metric not in {"ce", "mean_diff"}: pytest.fail( "FMS_TEST_SHAPES_SKIP_ASSERTIONS can only accept metrics ce and mean_diff" ) _skip_assertions.append(metric) - skip_assertions = set(_skip_assertions) + SKIP_ASSERTIONS = set(_skip_assertions) -compile_dynamic_sendnn = ATTN_TYPE == "paged" +COMPILE_DYNAMIC_SENDNN = ATTN_TYPE == "paged" -if compile_dynamic_sendnn: +if COMPILE_DYNAMIC_SENDNN: import bisect # the compiler supports certain max context lengths (VLLM_DT_MAX_CONTEXT_LEN) # this will ensure that we select smallest supported VLLM_DT_MAX_CONTEXT_LEN that fits the largest possible context (prompt size + max_new_tokens) - __largest_context = max(common_seq_lengths) + max(common_max_new_tokens) + __largest_context = max(COMMON_SEQ_LENGTHS) + max(COMMON_MAX_NEW_TOKENS) __supported_context_lengths = [256, 512, 1024, 2048, 4096, 8192, 16384, 32768] os.environ["VLLM_DT_MAX_CONTEXT_LEN"] = str( __supported_context_lengths[ bisect.bisect_left(__supported_context_lengths, __largest_context) ] ) - os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(max(common_batch_sizes), 2)) - fx_config.backed_size_oblivious = True + os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(max(COMMON_BATCH_SIZES), 2)) # thresholds are chosen based on 1024 tokens per sequence # 1% error threshold rate between cpu fp32 and cuda fp16 # if a models failure thresholds do not exist in this dict, default to the default_metrics_threshold defined above # threshold key is (model_id, is_tiny_model) -fail_thresholds = { +FAIL_THRESHOLDS = { (LLAMA_3p1_8B_INSTRUCT, False): ( 2.6994638133048965, 0.00047589250549208347, @@ -216,21 +214,21 @@ ), } -if model_configuration_path != "": +if MODEL_CONFIGURATION_PATH != "": print( "ignoring FMS_TEST_SHAPES_COMMON_MODEL_PATHS, FMS_TEST_SHAPES_USE_MICRO_MODELS as configuration will be set by FMS_TEST_SHAPES_FROM_MODEL_CONFIGURATION" ) USE_MICRO_MODELS = False - common_model_paths = [] - frequency = int(model_configuration_frequency) - with open(model_configuration_path, "r") as f: + COMMON_MODEL_PATHS = [] + frequency = int(MODEL_CONFIGURATION_FREQUENCY) + with open(MODEL_CONFIGURATION_PATH, "r") as f: for line in f: try: model_config = json.loads(line) if model_config["frequency"] <= frequency: - common_model_paths.append(model_config["model_id"]) + COMMON_MODEL_PATHS.append(model_config["model_id"]) # assume fullsize models - fail_thresholds[(model_config["model_id"], USE_MICRO_MODELS)] = ( + FAIL_THRESHOLDS[(model_config["model_id"], USE_MICRO_MODELS)] = ( model_config["ce"], model_config["mean_diff"], ) @@ -239,10 +237,10 @@ common_shapes = list( itertools.product( - common_model_paths, - common_batch_sizes, - common_seq_lengths, - common_max_new_tokens, + COMMON_MODEL_PATHS, + COMMON_BATCH_SIZES, + COMMON_SEQ_LENGTHS, + COMMON_MAX_NEW_TOKENS, ) ) @@ -256,11 +254,11 @@ @pytest.fixture(autouse=True) def reset_compiler(): yield # run the test - if not compile_dynamic_sendnn: + if not COMPILE_DYNAMIC_SENDNN: torch.compiler.reset() torch._dynamo.reset() os.environ.pop("COMPILATION_MODE", None) - os.environ.pop('TORCH_SENDNN_CACHE_ENABLE', None) + # TODO: Currently, gptq does not have the same level of support as non-gptq models for get_model. This method provides the extra requirements for gptq for get_model, # however ideally, these fixes should be done in foundation-model-stack. 
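# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch series): the VLLM_DT_MAX_CONTEXT_LEN
# selection above uses bisect_left over the sorted list of compiler-supported
# context lengths to pick the smallest supported value that still fits the
# longest prompt plus max_new_tokens. The helper name and the numbers below are
# hypothetical and only demonstrate that behaviour.
import bisect

SUPPORTED_CONTEXT_LENGTHS = [256, 512, 1024, 2048, 4096, 8192, 16384, 32768]

def pick_max_context_len(seq_lengths, max_new_tokens):
    # largest context any test case can produce: longest prompt + longest decode
    largest_context = max(seq_lengths) + max(max_new_tokens)
    # index of the first supported length that is >= largest_context
    idx = bisect.bisect_left(SUPPORTED_CONTEXT_LENGTHS, largest_context)
    return SUPPORTED_CONTEXT_LENGTHS[idx]

assert pick_max_context_len([64, 2048], [128]) == 4096  # 2176 rounds up to 4096
assert pick_max_context_len([64, 1920], [128]) == 2048  # an exact fit stays at 2048
# ---------------------------------------------------------------------------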
@@ -309,6 +307,7 @@ def __maybe_get_gptq_kwargs(model_path): pass return gptq_kwargs_aiu, gptq_kwargs_cpu + def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0): if "paged" in ATTN_NAME: prompts_and_sizes = sample_sharegpt_requests( @@ -337,6 +336,14 @@ def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0): prompt_list.append(tokenizer.encode(prompt, return_tensors="pt").squeeze(0)) input_ids, extra_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length) + extra_kwargs["attn_name"] = ATTN_NAME + if ( + "paged" in ATTN_NAME + and "ibm-granite/granite-3.3-8b-instruct" in model_path + and USE_DISTRIBUTED + and dist.get_world_size() == 4 + ): + extra_kwargs["_kvcache_num_blocks_hint"] = KVCACHE_NUM_BLOCKS_HINT return input_ids, extra_kwargs @@ -374,7 +381,7 @@ def __load_validation_info( ): # if path doesn't exist and paged isn't in the attention name, remove `attn_type` and recheck again, warn that we will no longer in the future have paths without 'attn_type' full_path = find_validation_info_path( - validation_info_dir, + VALIDATION_INFO_DIR, model_path, batch_size, seq_length, @@ -411,10 +418,10 @@ def get_or_create(self, is_gptq, is_fp8, **kwargs): model.eval() model.compile( - backend="sendnn", options={"sendnn.dynamic": compile_dynamic_sendnn} + backend="sendnn", options={"sendnn.dynamic": COMPILE_DYNAMIC_SENDNN} ) - if compile_dynamic_sendnn: + if COMPILE_DYNAMIC_SENDNN: self.model = model return model @@ -463,31 +470,54 @@ def persistent_model(): return PersistentModel() -@pytest.mark.parametrize( - "model_path,batch_size,seq_length,max_new_tokens", common_shapes -) -def test_common_shapes( - model_path, - batch_size, - seq_length, - max_new_tokens, - persistent_model, - record_property, +##### Common utils +# metric calculator based on the cross-entropy and mean diff for each decode step +def _metric_calculator(r: torch.Tensor, t: torch.Tensor): + cross_entropy = torch.nn.CrossEntropyLoss()( + r, t.softmax(dim=1).to(dtype=torch.float32) + ) + diff = torch.mean( + torch.abs( + r.softmax(dim=1).to(dtype=torch.float32) + - t.softmax(dim=1).to(dtype=torch.float32) + ) + ) + return (cross_entropy, diff) + + +def _check_failure_thresholds( + diff_fail_responses_list, ce_fail_responses_list, total_tokens ): - torch.manual_seed(42) - torch.set_grad_enabled(False) - os.environ["COMPILATION_MODE"] = "offline_decoder" + # test the failure rates for across all tokens + diff_failure_rate = len(diff_fail_responses_list) / total_tokens + ce_failure_rate = len(ce_fail_responses_list) / total_tokens + dprint(f"mean diff failure rate: {diff_failure_rate}") + dprint(f"cross entropy loss failure rate: {ce_failure_rate}") + + # Add failure rates to xml report + record_property("mean_diff_failure_rate", diff_failure_rate) + record_property("cross_entropy_loss_failure_rate", ce_failure_rate) + + if "mean_diff" not in SKIP_ASSERTIONS: + assert diff_failure_rate < FAILURE_RATE_THRESHOLD, ( + f"failure rate for mean diff was too high: {diff_failure_rate}" + ) + if "ce" not in SKIP_ASSERTIONS: + assert ce_failure_rate < FAILURE_RATE_THRESHOLD, ( + f"failure rate for cross entropy loss was too high: {ce_failure_rate}" + ) + print("passed validation level 1") + else: + print("passed validation level 0") - dprint( - f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}, attn_type={ATTN_TYPE}" - ) - # we don't currently support inferring gptq from get_model, so we must use an adapter with 
hf_configured - gptq_kwargs_aiu, gptq_kwargs_cpu = __maybe_get_gptq_kwargs(model_path) - is_gptq = len(gptq_kwargs_aiu) != 0 - is_fp8 = "fp8" in ATTN_NAME +def _get_common_model_kwargs(is_gptq, model_path): + if is_gptq: + return {} + # Get the micro model kwargs + # TODO clean up path handling for micro models + micro_model_path = MICRO_MODEL_MAPPING.get(model_path, None) - micro_model_path = micro_model_mapping.get(model_path, None) if USE_MICRO_MODELS and micro_model_path is None: dprint("using randomly initialized model") micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3} @@ -495,6 +525,7 @@ def test_common_shapes( dprint("using trained model") micro_model_kwargs = {"architecture": "hf_pretrained"} + # Get the model path kwargs if not USE_MICRO_MODELS and os.path.exists(model_path): model_path_kwargs = {"model_path": model_path} elif USE_MICRO_MODELS and micro_model_path is not None: @@ -502,84 +533,188 @@ def test_common_shapes( else: model_path_kwargs = {"variant": model_path} + # Get the distributed kwargs distributed_kwargs = {} if USE_DISTRIBUTED: distributed_kwargs["distributed_strategy"] = "tp" distributed_kwargs["group"] = dist.group.WORLD - get_model_kwargs = {} - if not is_gptq: - get_model_kwargs = { - **model_path_kwargs, - **micro_model_kwargs, - **distributed_kwargs, - } + return { + **model_path_kwargs, + **micro_model_kwargs, + **distributed_kwargs, + } - tokenizer = AutoTokenizer.from_pretrained(model_path) - # prepare the AIU model - model = persistent_model.get_or_create( - is_gptq, is_fp8, **gptq_kwargs_aiu, **get_model_kwargs - ) +# NOTE micro_model_state_dict should be None if USE_MICRO_MODELS is true +# Otherwise it should be model.state_dict() where model is the AIU model +def _get_cpu_model(model_path, gptq_kwargs, micro_model_state_dict=None): + is_gptq = len(gptq_kwargs) != 0 + model_kwargs = _get_common_model_kwargs(is_gptq, model_path) # prepare the cpu model validation_model = get_model( device_type="cpu", - data_type=None if is_fp8 or is_gptq else torch.float32, + data_type=None if is_gptq else torch.float32, fused_weights=False, - **gptq_kwargs_cpu, - **get_model_kwargs, + **gptq_kwargs, + **model_kwargs, ) - if USE_MICRO_MODELS: + # This is a micro model, so we need to copy the state dict directly. 
+ if micro_model_state_dict is not None: serialization.load_state_dict_into_model( - validation_model, model.state_dict(), **__custom_adapter + validation_model, micro_model_state_dict, **__custom_adapter ) + return validation_model - # prepare input_ids - input_ids, extra_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) - extra_kwargs["attn_name"] = ATTN_NAME - if ( - "paged" in ATTN_NAME - and "ibm-granite/granite-3.3-8b-instruct" in model_path - and USE_DISTRIBUTED - and dist.get_world_size() == 4 - ): - extra_kwargs["_kvcache_num_blocks_hint"] = KVCACHE_NUM_BLOCKS_HINT - # warmup aiu model - warmup_model( - model, input_ids, max_new_tokens, compile_dynamic_sendnn, **extra_kwargs - ) +def _get_aiu_model(model_path, gptq_kwargs, persistent_model_inst): + is_gptq = len(gptq_kwargs) != 0 + is_fp8 = "fp8" in ATTN_NAME + model_kwargs = _get_common_model_kwargs(is_gptq, model_path) - # generate cpu validation info - cpu_validation_info = __load_validation_info( - model_path, batch_size, seq_length, max_new_tokens, tokenizer, 0, ATTN_NAME - ) - if cpu_validation_info is None: - cpu_validation_info = extract_validation_information( - validation_model, - input_ids, + # prepare the AIU model; use the persistent model fixure if the test has it + if persistent_model_inst is not None: + aiu_model = persistent_model_inst.get_or_create( + is_gptq, is_fp8, **gptq_kwargs, **model_kwargs + ) + # otherwise create it directly + else: + aiu_model = get_model( + device_type="cpu", + data_type=None if is_gptq else torch.float16, + fused_weights=False, + **gptq_kwargs, + **model_kwargs, + ) + aiu_model.eval() + aiu_model.compile( + backend="sendnn", + options={"sendnn.dynamic": COMPILE_DYNAMIC_SENDNN}, + ) + return aiu_model + + +def _get_device_validation_information( + model_path, + batch_size, + seq_length, + max_new_tokens, + post_iteration_hook, + model, + input_ids, + extra_kwargs, + token_iter, + device="aiu", + tokenizer=None, + only_last_token=None, +): + # For CPU, we try to load it from disk first if it exists + if device == "cpu": + cpu_validation_info = __load_validation_info( + model_path, + batch_size, + seq_length, max_new_tokens, - LogitsExtractorHook(), - attn_algorithm="math", - timing=TIMING, - **extra_kwargs, + tokenizer, + token_iter, + ATTN_NAME, # TODO checkme ) - if save_validation_info_outputs: - cpu_validation_info.save( - get_validation_info_path( - validation_info_dir, - model_path, - batch_size, - seq_length, - max_new_tokens, - 0, - ATTN_NAME, - dtype=CPU_DTYPE, - ) + if cpu_validation_info is not None: + return cpu_validation_info + + # Don't save iter 0 for AIU only + skip_save = device == "aiu" and token_iter == 0 + # overrides for validation info that are device specific + device_dependent_kwargs = {} + if device == "cpu": + device_dependent_kwargs["attn_algorithm"] = "math" + + if device == "aiu" and only_last_token is not None: + device_dependent_kwargs["only_last_token"] = only_last_token + device_dependent_kwargs["last_n_tokens"] = 64 if "paged" in ATTN_NAME else 1 + + # Otherwise we need to get the AIU / CPU validation info + validation_info = extract_validation_information( + model, + input_ids, + max_new_tokens, + post_iteration_hook, + timing=TIMING, + **extra_kwargs, + **device_dependent_kwargs, + ) + + if not skip_save and SAVE_VALIDATION_INFO_OUTPUTS: + dprint(f"saving {device} validation for - iter={token_iter}") + # TODO - there is probably a cleaner way to handle this too + kwargs = {} + if device == "cpu": + kwargs["dtype"] = CPU_DTYPE + + 
validation_info.save( + get_validation_info_path( + validation_info_dir, + model_path, + batch_size, + seq_length, + max_new_tokens, + token_iter, + ATTN_NAME, + device_type=device, + **kwargs, + ) + ) + return validation_info + + +def _resolve_thresholds(model_path, micro_model_path): + # if we do not have real model weights, use a default_metrics_threshold + if USE_MICRO_MODELS and micro_model_path is None: + ce_threshold, diff_threshold = DEFAULT_METRICS_THRESHOLD + # if we have real weights, try and get the proper validation metrics threshold + else: + # if we have a micro model with real weights, but no real thresholds, default to the full model thresholds + if USE_MICRO_MODELS: + ce_threshold, diff_threshold = FAIL_THRESHOLDS.get( + (model_path, True), + FAIL_THRESHOLDS.get((model_path, False), DEFAULT_METRICS_THRESHOLD), ) + else: + ce_threshold, diff_threshold = FAIL_THRESHOLDS.get( + (model_path, False), DEFAULT_METRICS_THRESHOLD + ) + return ce_threshold, diff_threshold + + +def _run_validation_level_0( + model_path, + batch_size, + seq_length, + max_new_tokens, + tokenizer, + validation_model, + input_ids, + extra_kwargs, + model, +): + cpu_validation_info = _get_device_validation_information( + model_path=model_path, + batch_size=batch_size, + seq_length=seq_length, + max_new_tokens=max_new_tokens, + post_iteration_hook=LogitsExtractorHook(), + model=validation_model, + input_ids=input_ids, + extra_kwargs=extra_kwargs, + token_iter=0, + device="cpu", + tokenizer=tokenizer, + ) + + # Get the cpu static toks / initial eos sequences for iter 0 cpu_static_tokens = cpu_validation_info.get_info("tokens") eos_indexes = __find_eos_index( cpu_static_tokens, tokenizer.eos_token_id, seq_length, max_new_tokens @@ -589,14 +724,19 @@ def test_common_shapes( ) # first test validation level 0 - aiu_validation_info = extract_validation_information( - model, - input_ids, - max_new_tokens, - None, - last_n_tokens=64 if "paged" in ATTN_NAME else 1, - timing=TIMING, - **extra_kwargs, + aiu_validation_info = _get_device_validation_information( + model_path=model_path, + batch_size=batch_size, + seq_length=seq_length, + max_new_tokens=max_new_tokens, + post_iteration_hook=None, + model=model, + input_ids=input_ids, + extra_kwargs=extra_kwargs, + token_iter=0, + device="aiu", + tokenizer=tokenizer, + only_last_token="paged" not in ATTN_NAME, ) dprint("aiu validation info extracted for validation level 0") @@ -605,442 +745,213 @@ def test_common_shapes( aiu_validation_info.get_info("tokens"), cpu_static_tokens ) - failed_validation_level_0 = len(failed_responses) != 0 + # Keep things we may need on the first iter for validation 1 + validation_zero_info = { + "cpu_validation_info": cpu_validation_info, + "cpu_static_tokens": cpu_static_tokens, + "eos_indexes": eos_indexes, + } + return len(failed_responses) != 0, validation_zero_info - # if level 0 fails validation, validate level 1 - if FORCE_VALIDATION_LEVEL_1 or failed_validation_level_0: - if failed_validation_level_0: - dprint("failed validation level 0, testing validation level 1") - else: - dprint("passed validation level 0, testing validation level 1") - - # metric calculator based on the cross-entropy and mean diff for each decode step - def _metric_calculator(r: torch.Tensor, t: torch.Tensor): - cross_entropy = torch.nn.CrossEntropyLoss()( - r, t.softmax(dim=1).to(dtype=torch.float32) - ) - diff = torch.mean( - torch.abs( - r.softmax(dim=1).to(dtype=torch.float32) - - t.softmax(dim=1).to(dtype=torch.float32) - ) - ) - return (cross_entropy, 
diff) - - iters = int(CUMULATIVE_TEST_TOKENS_PER_SEQUENCE) // max_new_tokens - ce_fail_responses_list = [] - diff_fail_responses_list = [] - total_tokens = 0 - for i in range(iters): - # for iteration 0, we have computed the cpu validation info in the prior step for seed=0, so skip - if i != 0: - input_ids, extra_kwargs = __prepare_inputs( - batch_size, seq_length, tokenizer, seed=i - ) - extra_kwargs["attn_name"] = ATTN_NAME - if ( - "paged" in ATTN_NAME - and "ibm-granite/granite-3.3-8b-instruct" in model_path - and USE_DISTRIBUTED - and dist.get_world_size() == 4 - ): - extra_kwargs["_kvcache_num_blocks_hint"] = KVCACHE_NUM_BLOCKS_HINT - - cpu_validation_info = __load_validation_info( - model_path, - batch_size, - seq_length, - max_new_tokens, - tokenizer, - i, - ATTN_NAME, - ) - if cpu_validation_info is None: - cpu_validation_info = extract_validation_information( - validation_model, - input_ids, - max_new_tokens, - LogitsExtractorHook(), - attn_algorithm="math", - timing=TIMING, - **extra_kwargs, - ) - dprint( - f"cpu validation info extracted for validation level 1 - iter={i}" - ) - if save_validation_info_outputs: - cpu_validation_info.save( - get_validation_info_path( - validation_info_dir, - model_path, - batch_size, - seq_length, - max_new_tokens, - i, - ATTN_NAME, - dtype=CPU_DTYPE, - ) - ) - cpu_static_tokens = cpu_validation_info.get_info("tokens") - eos_indexes = __find_eos_index( - cpu_static_tokens, - tokenizer.eos_token_id, - seq_length, - max_new_tokens, - ) - - # generate aiu validation info - aiu_validation_info = extract_validation_information( - model, - input_ids, - max_new_tokens, - GoldenTokenHook(cpu_static_tokens), - last_n_tokens=64 if "paged" in ATTN_NAME else 1, - timing=TIMING, - **extra_kwargs, - ) - dprint(f"aiu validation info extracted for validation level 1 - iter={i}") - if save_validation_info_outputs: - aiu_validation_info.save( - get_validation_info_path( - validation_info_dir, - model_path, - batch_size, - seq_length, - max_new_tokens, - i, - ATTN_NAME, - device_type="aiu", - ) - ) - # capture all level 1 metrics - level_1_metrics = capture_level_1_metrics( - cpu_validation_info.get_info("logits"), - aiu_validation_info.get_info("logits"), - top_k_loss_calculator(20, _metric_calculator), - ) - # only consider those metrics captured prior to the eos - level_1_metrics = __filter_before_eos(level_1_metrics, eos_indexes) - - # if we do not have real model weights, use a default_metrics_threshold - if USE_MICRO_MODELS and micro_model_path is None: - ce_threshold, diff_threshold = default_metrics_threshold - # if we have real weights, try and get the proper validation metrics threshold - else: - # if we have a micro model with real weights, but no real thresholds, default to the full model thresholds - if USE_MICRO_MODELS: - ce_threshold, diff_threshold = fail_thresholds.get( - (model_path, True), - fail_thresholds.get( - (model_path, False), default_metrics_threshold - ), - ) - else: - ce_threshold, diff_threshold = fail_thresholds.get( - (model_path, False), default_metrics_threshold - ) - - # get all failed responses for each metric - ce_fail_responses = filter_failed_level_1_cases( - level_1_metrics, lambda m: m[0] >= ce_threshold - ) - diff_fail_responses = filter_failed_level_1_cases( - level_1_metrics, - lambda m: m[1] >= diff_threshold, +def _run_validation_level_1( + model_path, + batch_size, + seq_length, + max_new_tokens, + tokenizer, + validation_model, + input_ids, + extra_kwargs, + model, + micro_model_path, + validation_zero_info, +): + 
iters = int(CUMULATIVE_TEST_TOKENS_PER_SEQUENCE) // max_new_tokens + ce_fail_responses_list = [] + diff_fail_responses_list = [] + total_tokens = 0 + for i in range(iters): + # for iteration 0, we have computed the cpu validation info in the prior step for seed=0, so skip + if i != 0: + cpu_validation_info = _get_device_validation_information( + model_path=model_path, + batch_size=batch_size, + seq_length=seq_length, + max_new_tokens=max_new_tokens, + post_iteration_hook=LogitsExtractorHook(), + model=validation_model, + input_ids=input_ids, + extra_kwargs=extra_kwargs, + token_iter=i, + device="cpu", + tokenizer=tokenizer, ) + dprint(f"cpu validation info extracted for validation level 1 - iter={i}") - ce_fail_responses_list.extend(ce_fail_responses) - diff_fail_responses_list.extend(diff_fail_responses) - total_tokens += len(level_1_metrics) - - # test the failure rates for across all tokens - diff_failure_rate = len(diff_fail_responses_list) / total_tokens - ce_failure_rate = len(ce_fail_responses_list) / total_tokens - dprint(f"mean diff failure rate: {diff_failure_rate}") - dprint(f"cross entropy loss failure rate: {ce_failure_rate}") - # Add failure rates to xml report - record_property("mean_diff_failure_rate", diff_failure_rate) - record_property("cross_entropy_loss_failure_rate", ce_failure_rate) - if "mean_diff" not in skip_assertions: - assert diff_failure_rate < failure_rate_threshold, ( - f"failure rate for mean diff was too high: {diff_failure_rate}" - ) - if "ce" not in skip_assertions: - assert ce_failure_rate < failure_rate_threshold, ( - f"failure rate for cross entropy loss was too high: {ce_failure_rate}" + cpu_static_tokens = cpu_validation_info.get_info("tokens") + eos_indexes = __find_eos_index( + cpu_static_tokens, + tokenizer.eos_token_id, + seq_length, + max_new_tokens, ) + else: + # TODO this can be cleaned up further + cpu_validation_info = validation_zero_info["cpu_validation_info"] + cpu_static_tokens = validation_zero_info["cpu_static_tokens"] + eos_indexes = validation_zero_info["eos_indexes"] + + aiu_validation_info = _get_device_validation_information( + model_path=model_path, + batch_size=batch_size, + seq_length=seq_length, + max_new_tokens=max_new_tokens, + post_iteration_hook=GoldenTokenHook(cpu_static_tokens), + model=model, + input_ids=input_ids, + extra_kwargs=extra_kwargs, + token_iter=i, + device="aiu", + tokenizer=tokenizer, + only_last_token=ATTN_TYPE != "paged", + ) + dprint(f"aiu validation info extracted for validation level 1 - iter={i}") - print("passed validation level 1") - else: - print("passed validation level 0") - -@pytest.mark.parametrize("cache_status", ["miss", "hit"]) -def test_cache(cache_status): - torch.manual_seed(42) - torch.set_grad_enabled(False) - os.environ["TORCH_SENDNN_CACHE_ENABLE"] = "1" - os.environ["TORCH_SENDNN_CACHE_DIR"] = os.getcwd()+"/.cache" - os.environ["COMPILATION_MODE"] = "offline_decoder" - - if cache_status == "miss" and os.path.isdir(os.getcwd()+"/.cache"): - # Remove cache from previous runs - shutil.rmtree(os.getcwd()+"/.cache") - - model_path = "ibm-granite/granite-3.3-8b-instruct" - batch_size = common_batch_sizes[0] - seq_length = common_seq_lengths[0] - max_new_tokens = common_max_new_tokens[0] - - dprint(f"testing with cache: model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}, cache={cache_status}") - - # we don't currently support inferring gptq from get_model, so we must use an adapter with hf_configured - 
gptq_kwargs_aiu, gptq_kwargs_cpu = __maybe_get_gptq_kwargs(model_path) - is_gptq = len(gptq_kwargs_aiu) != 0 - - micro_model_path = micro_model_mapping.get(model_path, None) - if USE_MICRO_MODELS and micro_model_path is None: - dprint("using randomly initialized model") - micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3} - else: - dprint("using trained model") - micro_model_kwargs = {"architecture": "hf_pretrained"} - - if not USE_MICRO_MODELS and os.path.exists(model_path): - model_path_kwargs = {"model_path": model_path} - elif USE_MICRO_MODELS and micro_model_path is not None: - model_path_kwargs = {"model_path": micro_model_path} - else: - model_path_kwargs = {"variant": model_path} - - distributed_kwargs = {} - if USE_DISTRIBUTED: - distributed_kwargs["distributed_strategy"] = "tp" - distributed_kwargs["group"] = dist.group.WORLD - - get_model_kwargs = {} - if not is_gptq: - get_model_kwargs = { - **model_path_kwargs, - **micro_model_kwargs, - **distributed_kwargs, - } + # capture all level 1 metrics + level_1_metrics = capture_level_1_metrics( + cpu_validation_info.get_info("logits"), + aiu_validation_info.get_info("logits"), + top_k_loss_calculator(20, _metric_calculator), + ) + # only consider those metrics captured prior to the eos + level_1_metrics = __filter_before_eos(level_1_metrics, eos_indexes) - tokenizer = tokenizers.get_tokenizer(model_path) + ce_threshold, diff_threshold = _resolve_thresholds(model_path, micro_model_path) - # prepare the AIU model - model = get_model( - device_type="cpu", - data_type=None if is_gptq else torch.float16, - fused_weights=False, - **get_model_kwargs, - ) + # get all failed responses for each metric + ce_fail_responses = filter_failed_level_1_cases( + level_1_metrics, lambda m: m[0] >= ce_threshold + ) + diff_fail_responses = filter_failed_level_1_cases( + level_1_metrics, + lambda m: m[1] >= diff_threshold, + ) - model.eval() - model.compile(backend="sendnn") + ce_fail_responses_list.extend(ce_fail_responses) + diff_fail_responses_list.extend(diff_fail_responses) + total_tokens += len(level_1_metrics) - # prepare the cpu model - validation_model = get_model( - device_type="cpu", - data_type=None if is_gptq else torch.float32, - fused_weights=False, - **gptq_kwargs_cpu, - **get_model_kwargs, + _check_failure_thresholds( + diff_fail_responses_list, ce_fail_responses_list, total_tokens ) - if USE_MICRO_MODELS: - serialization.load_state_dict_into_model( - validation_model, model.state_dict(), **__custom_adapter - ) + +##### Test definitions +def _run_cpu_aiu_validation_test( + model_path, + batch_size, + seq_length, + max_new_tokens, + cpu_model, + aiu_model, + micro_model_path, +): + # Get the tokenizer and AIU / CPU models to compare + tokenizer = AutoTokenizer.from_pretrained(model_path) # prepare input_ids input_ids, extra_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) + extra_kwargs["attn_name"] = ATTN_NAME + if ( + "paged" in ATTN_NAME + and "ibm-granite/granite-3.3-8b-instruct" in model_path + and USE_DISTRIBUTED + and dist.get_world_size() == 4 + ): + extra_kwargs["_kvcache_num_blocks_hint"] = KVCACHE_NUM_BLOCKS_HINT # warmup aiu model - warmup_model(model, input_ids, max_new_tokens, compile_dynamic_sendnn, **extra_kwargs) - - # generate cpu validation info - cpu_validation_info = __load_validation_info( - model_path, batch_size, seq_length, max_new_tokens, tokenizer, 0 - ) - if cpu_validation_info is None: - cpu_validation_info = extract_validation_information( - validation_model, - input_ids, - 
max_new_tokens, - LogitsExtractorHook(), - attn_algorithm="math", - **extra_kwargs, - ) - - if save_validation_info_outputs: - cpu_validation_info.save( - __get_validation_info_full_path( - model_path, batch_size, seq_length, max_new_tokens, 0 - ) - ) - cpu_static_tokens = cpu_validation_info.get_info("tokens") - eos_indexes = __find_eos_index( - cpu_static_tokens, tokenizer.eos_token_id, seq_length, max_new_tokens - ) - dprint( - "cpu validation info extracted for validation level 0 and validation level 1 (iter=0)" - ) - - # first test validation level 0 - aiu_validation_info = extract_validation_information( - model, input_ids, max_new_tokens, None, only_last_token="paged" not in ATTN_NAME, **extra_kwargs + warmup_model( + aiu_model, input_ids, max_new_tokens, COMPILE_DYNAMIC_SENDNN, **extra_kwargs ) - dprint("aiu validation info extracted for validation level 0") - - # check cache status before validating cached results - updated_cache_len = len(os.listdir(os.getcwd()+"/.cache")) if os.path.isdir(os.getcwd()+"/.cache") else 0 - if cache_status == "miss": - assert updated_cache_len == max_new_tokens, ( - "cache directory not populated on cache miss" - ) - return - else: - assert updated_cache_len == max_new_tokens, ( - "cache miss occurred when hit was expected" - ) - # validate level 0 - failed_responses = validate_level_0( - aiu_validation_info.get_info("tokens"), cpu_static_tokens + # Run validation level 0 + failed_validation_level_0, validation_zero_info = _run_validation_level_0( + model_path, + batch_size, + seq_length, + max_new_tokens, + tokenizer, + cpu_model, + input_ids, + extra_kwargs, + aiu_model, ) - failed_validation_level_0 = len(failed_responses) != 0 - # if level 0 fails validation, validate level 1 if FORCE_VALIDATION_LEVEL_1 or failed_validation_level_0: - if failed_validation_level_0: dprint("failed validation level 0, testing validation level 1") else: dprint("passed validation level 0, testing validation level 1") + _run_validation_level_1( + model_path, + batch_size, + seq_length, + max_new_tokens, + tokenizer, + cpu_model, + input_ids, + extra_kwargs, + aiu_model, + micro_model_path, + validation_zero_info, + ) - # metric calculator based on the cross-entropy and mean diff for each decode step - def _metric_calculator(r: torch.Tensor, t: torch.Tensor): - cross_entropy = torch.nn.CrossEntropyLoss()( - r, t.softmax(dim=1).to(dtype=torch.float32) - ) - diff = torch.mean( - torch.abs( - r.softmax(dim=1).to(dtype=torch.float32) - - t.softmax(dim=1).to(dtype=torch.float32) - ) - ) - return (cross_entropy, diff) - - iters = 1024 // max_new_tokens - ce_fail_responses_list = [] - diff_fail_responses_list = [] - total_tokens = 0 - for i in range(iters): - # for iteration 0, we have computed the cpu validation info in the prior step for seed=0, so skip - if i != 0: - input_ids, extra_kwargs = __prepare_inputs( - batch_size, seq_length, tokenizer, seed=i - ) - extra_kwargs["attn_name"] = ATTN_NAME - cpu_validation_info = __load_validation_info( - model_path, batch_size, seq_length, max_new_tokens, tokenizer, i - ) - if cpu_validation_info is None: - cpu_validation_info = extract_validation_information( - validation_model, - input_ids, - max_new_tokens, - LogitsExtractorHook(), - attn_algorithm="math", - **extra_kwargs, - ) - dprint( - f"cpu validation info extracted for validation level 1 - iter={i}" - ) - if save_validation_info_outputs: - cpu_validation_info.save( - __get_validation_info_full_path( - model_path, batch_size, seq_length, max_new_tokens, i - ) - ) - 
cpu_static_tokens = cpu_validation_info.get_info("tokens") - eos_indexes = __find_eos_index( - cpu_static_tokens, - tokenizer.eos_token_id, - seq_length, - max_new_tokens, - ) - - # generate aiu validation info - aiu_validation_info = extract_validation_information( - model, - input_ids, - max_new_tokens, - GoldenTokenHook(cpu_static_tokens), - only_last_token=ATTN_TYPE != "paged", - **extra_kwargs, - ) - dprint(f"aiu validation info extracted for validation level 1 - iter={i}") - if save_validation_info_outputs: - aiu_validation_info.save( - __get_validation_info_full_path( - model_path, batch_size, seq_length, max_new_tokens, i, "aiu" - ) - ) - # capture all level 1 metrics - level_1_metrics = capture_level_1_metrics( - cpu_validation_info.get_info("logits"), - aiu_validation_info.get_info("logits"), - top_k_loss_calculator(20, _metric_calculator), - ) - # only consider those metrics captured prior to the eos - level_1_metrics = __filter_before_eos(level_1_metrics, eos_indexes) +@pytest.mark.parametrize( + "model_path,batch_size,seq_length,max_new_tokens", common_shapes +) +def test_common_shapes( + model_path, + batch_size, + seq_length, + max_new_tokens, + persistent_model, + record_property, +): + torch.manual_seed(42) + torch.set_grad_enabled(False) + os.environ["COMPILATION_MODE"] = "offline_decoder" + micro_model_path = MICRO_MODEL_MAPPING.get(model_path, None) - # if we do not have real model weights, use a default_metrics_threshold - if USE_MICRO_MODELS and micro_model_path is None: - ce_threshold, diff_threshold = default_metrics_threshold - # if we have real weights, try and get the proper validation metrics threshold - else: - # if we have a micro model with real weights, but no real thresholds, default to the full model thresholds - if USE_MICRO_MODELS: - ce_threshold, diff_threshold = fail_thresholds.get( - (model_path, True), fail_thresholds.get((model_path, False), default_metrics_threshold) - ) - else: - ce_threshold, diff_threshold = fail_thresholds.get( - (model_path, False), default_metrics_threshold - ) + dprint( + f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}, attn_type={ATTN_TYPE}" + ) - # get all failed responses for each metric - ce_fail_responses = filter_failed_level_1_cases( - level_1_metrics, lambda m: m[0] >= ce_threshold - ) - diff_fail_responses = filter_failed_level_1_cases( - level_1_metrics, - lambda m: m[1] >= diff_threshold, - ) + # we don't currently support inferring gptq from get_model, so we must use an adapter with hf_configured + gptq_kwargs_aiu, gptq_kwargs_cpu = __maybe_get_gptq_kwargs(model_path) - ce_fail_responses_list.extend(ce_fail_responses) - diff_fail_responses_list.extend(diff_fail_responses) - total_tokens += len(level_1_metrics) - - # test the failure rates for across all tokens - diff_failure_rate = len(diff_fail_responses_list) / total_tokens - ce_failure_rate = len(ce_fail_responses_list) / total_tokens - dprint(f"mean diff failure rate: {diff_failure_rate}") - dprint(f"cross entropy loss failure rate: {ce_failure_rate}") - if "mean_diff" not in skip_assertions: - assert diff_failure_rate < failure_rate_threshold, ( - f"failure rate for mean diff was too high: {diff_failure_rate}" - ) - if "ce" not in skip_assertions: - assert ce_failure_rate < failure_rate_threshold, ( - f"failure rate for cross entropy loss was too high: {ce_failure_rate}" - ) - print("passed validation level 1") - else: - print("passed validation level 0") + model 
= _get_aiu_model( + model_path, + gptq_kwargs_aiu, + persistent_model_inst=persistent_model, + ) + + validation_model = _get_cpu_model( + model_path, + gptq_kwargs_cpu, + micro_model_state_dict=model.state_dict() if USE_MICRO_MODELS else None, + ) + + _run_cpu_aiu_validation_test( + model_path, + batch_size, + seq_length, + max_new_tokens, + validation_model, + model, + micro_model_path, + ) From ad3a5847b174380604a01d9ac6c88847bd8b492d Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 11 Sep 2025 10:50:53 +0000 Subject: [PATCH 15/22] don't skip save on aiu iter0 Signed-off-by: Alex-Brooks --- tests/models/test_decoders.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index a4a3e247..a5f786d1 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -625,8 +625,6 @@ def _get_device_validation_information( if cpu_validation_info is not None: return cpu_validation_info - # Don't save iter 0 for AIU only - skip_save = device == "aiu" and token_iter == 0 # overrides for validation info that are device specific device_dependent_kwargs = {} if device == "cpu": @@ -646,8 +644,7 @@ def _get_device_validation_information( **extra_kwargs, **device_dependent_kwargs, ) - - if not skip_save and SAVE_VALIDATION_INFO_OUTPUTS: + if SAVE_VALIDATION_INFO_OUTPUTS: dprint(f"saving {device} validation for - iter={token_iter}") # TODO - there is probably a cleaner way to handle this too kwargs = {} From 54cc09ba883f5f60c81711333d7039e43919325f Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 11 Sep 2025 11:10:48 +0000 Subject: [PATCH 16/22] fix fp8 dtype, always use persistent model fixture Signed-off-by: Alex-Brooks --- tests/models/test_decoders.py | 52 ++++++++++------------------------- 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index a5f786d1..b43aeba3 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -548,17 +548,13 @@ def _get_common_model_kwargs(is_gptq, model_path): # NOTE micro_model_state_dict should be None if USE_MICRO_MODELS is true # Otherwise it should be model.state_dict() where model is the AIU model -def _get_cpu_model(model_path, gptq_kwargs, micro_model_state_dict=None): - is_gptq = len(gptq_kwargs) != 0 - model_kwargs = _get_common_model_kwargs(is_gptq, model_path) - +def _get_cpu_model(model_path, is_gptq, is_fp8, micro_model_state_dict=None, **kwargs): # prepare the cpu model validation_model = get_model( device_type="cpu", - data_type=None if is_gptq else torch.float32, + data_type=None if is_fp8 or is_gptq else torch.float32, fused_weights=False, - **gptq_kwargs, - **model_kwargs, + **kwargs, ) # This is a micro model, so we need to copy the state dict directly. 
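# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch series): the micro-model path in
# _get_cpu_model above copies the AIU model's randomly initialized weights into
# the fp32 CPU reference model, so both sides are compared with identical
# parameters. Plain torch.nn modules stand in here for the fms models; the real
# code goes through get_model and serialization.load_state_dict_into_model.
import torch

aiu_model = torch.nn.Linear(8, 8).to(torch.float16)  # stands in for the fp16 AIU model
cpu_model = torch.nn.Linear(8, 8).to(torch.float32)  # stands in for the fp32 reference

# mirror the (random) AIU weights into the reference model, upcasting to fp32
cpu_model.load_state_dict(
    {k: v.to(torch.float32) for k, v in aiu_model.state_dict().items()}
)

x = torch.randn(2, 8)
reference_logits = cpu_model(x)
device_logits = aiu_model(x.to(torch.float16)).to(torch.float32)
# with shared weights, only dtype-induced numerical differences remain
print(torch.max(torch.abs(reference_logits - device_logits)))
# ---------------------------------------------------------------------------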
@@ -569,32 +565,6 @@ def _get_cpu_model(model_path, gptq_kwargs, micro_model_state_dict=None): return validation_model -def _get_aiu_model(model_path, gptq_kwargs, persistent_model_inst): - is_gptq = len(gptq_kwargs) != 0 - is_fp8 = "fp8" in ATTN_NAME - model_kwargs = _get_common_model_kwargs(is_gptq, model_path) - - # prepare the AIU model; use the persistent model fixure if the test has it - if persistent_model_inst is not None: - aiu_model = persistent_model_inst.get_or_create( - is_gptq, is_fp8, **gptq_kwargs, **model_kwargs - ) - # otherwise create it directly - else: - aiu_model = get_model( - device_type="cpu", - data_type=None if is_gptq else torch.float16, - fused_weights=False, - **gptq_kwargs, - **model_kwargs, - ) - aiu_model.eval() - aiu_model.compile( - backend="sendnn", - options={"sendnn.dynamic": COMPILE_DYNAMIC_SENDNN}, - ) - return aiu_model - def _get_device_validation_information( model_path, @@ -931,16 +901,22 @@ def test_common_shapes( # we don't currently support inferring gptq from get_model, so we must use an adapter with hf_configured gptq_kwargs_aiu, gptq_kwargs_cpu = __maybe_get_gptq_kwargs(model_path) - model = _get_aiu_model( - model_path, - gptq_kwargs_aiu, - persistent_model_inst=persistent_model, + is_gptq = len(gptq_kwargs_aiu) != 0 + is_fp8 = "fp8" in ATTN_NAME + model_kwargs = _get_common_model_kwargs(is_gptq, model_path) + + # Get the AIU model w/ the persistent model fixture + model = persistent_model.get_or_create( + is_gptq, is_fp8, **gptq_kwargs_aiu, **model_kwargs ) validation_model = _get_cpu_model( model_path, - gptq_kwargs_cpu, + is_gptq, + is_fp8, micro_model_state_dict=model.state_dict() if USE_MICRO_MODELS else None, + **gptq_kwargs_cpu, + **model_kwargs, ) _run_cpu_aiu_validation_test( From 75eb5fb5a38348efd7c3ad3618b5a864c92dd39f Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Thu, 11 Sep 2025 12:09:10 +0000 Subject: [PATCH 17/22] remove model path from get_cpu_model args Signed-off-by: Alex-Brooks --- tests/models/test_decoders.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index b43aeba3..8f3e067a 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -548,7 +548,7 @@ def _get_common_model_kwargs(is_gptq, model_path): # NOTE micro_model_state_dict should be None if USE_MICRO_MODELS is true # Otherwise it should be model.state_dict() where model is the AIU model -def _get_cpu_model(model_path, is_gptq, is_fp8, micro_model_state_dict=None, **kwargs): +def _get_cpu_model(is_gptq, is_fp8, micro_model_state_dict=None, **kwargs): # prepare the cpu model validation_model = get_model( device_type="cpu", @@ -911,7 +911,6 @@ def test_common_shapes( ) validation_model = _get_cpu_model( - model_path, is_gptq, is_fp8, micro_model_state_dict=model.state_dict() if USE_MICRO_MODELS else None, From f1a810eafee5cddbaf44f35d5139147b2bcf20ff Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Mon, 6 Oct 2025 19:22:29 +0000 Subject: [PATCH 18/22] fix casing error Signed-off-by: Alex-Brooks --- tests/models/test_decoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index 8f3e067a..e04e2b03 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -125,7 +125,7 @@ if USE_DISTRIBUTED: dist.init_process_group() aiu_dist_setup(dist.get_rank(), dist.get_world_size()) - save_validation_info_outputs = save_validation_info_outputs and ( + 
SAVE_VALIDATION_INFO_OUTPUTS = SAVE_VALIDATION_INFO_OUTPUTS and ( dist.get_rank() == 0 ) From 15cf522c967028019cfb050572a2187ab5ae493a Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 8 Oct 2025 10:35:34 +0000 Subject: [PATCH 19/22] Rebase fixes, linting Signed-off-by: Alex-Brooks --- tests/models/test_decoders.py | 61 +++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index e04e2b03..7bc82b09 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -132,7 +132,7 @@ if USE_MICRO_MODELS: VALIDATION_INFO_DIR = os.path.join(VALIDATION_INFO_DIR, "tiny_models") -# pass custom model path list for eg: EXPORT FMS_TESTING_COMMON_MODEL_PATHS="/tmp/models/granite-3-8b-base,/tmp/models/granite-7b-base" +# pass custom model path list for eg: EXPORT FMS_TEST_SHAPES_COMMON_MODEL_PATHS="/tmp/models/granite-3-8b-base,/tmp/models/granite-7b-base" if isinstance(COMMON_MODEL_PATHS, str): COMMON_MODEL_PATHS = COMMON_MODEL_PATHS.split(",") @@ -185,7 +185,7 @@ ] ) os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(max(COMMON_BATCH_SIZES), 2)) - + fx_config.backed_size_oblivious = True # thresholds are chosen based on 1024 tokens per sequence # 1% error threshold rate between cpu fp32 and cuda fp16 @@ -220,22 +220,22 @@ ) USE_MICRO_MODELS = False COMMON_MODEL_PATHS = [] - frequency = int(MODEL_CONFIGURATION_FREQUENCY) + FREQUENCY = int(MODEL_CONFIGURATION_FREQUENCY) with open(MODEL_CONFIGURATION_PATH, "r") as f: for line in f: try: - model_config = json.loads(line) - if model_config["frequency"] <= frequency: - COMMON_MODEL_PATHS.append(model_config["model_id"]) + MODEL_CONFIG = json.loads(line) + if MODEL_CONFIG["frequency"] <= FREQUENCY: + COMMON_MODEL_PATHS.append(MODEL_CONFIG["model_id"]) # assume fullsize models - FAIL_THRESHOLDS[(model_config["model_id"], USE_MICRO_MODELS)] = ( - model_config["ce"], - model_config["mean_diff"], + FAIL_THRESHOLDS[(MODEL_CONFIG["model_id"], USE_MICRO_MODELS)] = ( + MODEL_CONFIG["ce"], + MODEL_CONFIG["mean_diff"], ) except json.JSONDecodeError: print(f"config contained an improper json line: {line.strip()}") -common_shapes = list( +COMMON_SHAPES = list( itertools.product( COMMON_MODEL_PATHS, COMMON_BATCH_SIZES, @@ -308,7 +308,7 @@ def __maybe_get_gptq_kwargs(model_path): return gptq_kwargs_aiu, gptq_kwargs_cpu -def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0): +def __prepare_inputs(batch_size, seq_length, tokenizer, model_path, seed=0): if "paged" in ATTN_NAME: prompts_and_sizes = sample_sharegpt_requests( SHARE_GPT_DATASET_PATH, @@ -486,7 +486,10 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): def _check_failure_thresholds( - diff_fail_responses_list, ce_fail_responses_list, total_tokens + diff_fail_responses_list, + ce_fail_responses_list, + total_tokens, + record_property=None, ): # test the failure rates for across all tokens diff_failure_rate = len(diff_fail_responses_list) / total_tokens @@ -494,9 +497,10 @@ def _check_failure_thresholds( dprint(f"mean diff failure rate: {diff_failure_rate}") dprint(f"cross entropy loss failure rate: {ce_failure_rate}") - # Add failure rates to xml report - record_property("mean_diff_failure_rate", diff_failure_rate) - record_property("cross_entropy_loss_failure_rate", ce_failure_rate) + if record_property is not None: + # Add failure rates to xml report + record_property("mean_diff_failure_rate", diff_failure_rate) + record_property("cross_entropy_loss_failure_rate", 
ce_failure_rate) if "mean_diff" not in SKIP_ASSERTIONS: assert diff_failure_rate < FAILURE_RATE_THRESHOLD, ( @@ -565,7 +569,6 @@ def _get_cpu_model(is_gptq, is_fp8, micro_model_state_dict=None, **kwargs): return validation_model - def _get_device_validation_information( model_path, batch_size, @@ -578,7 +581,6 @@ def _get_device_validation_information( token_iter, device="aiu", tokenizer=None, - only_last_token=None, ): # For CPU, we try to load it from disk first if it exists if device == "cpu": @@ -589,7 +591,7 @@ def _get_device_validation_information( max_new_tokens, tokenizer, token_iter, - ATTN_NAME, # TODO checkme + ATTN_NAME, ) if cpu_validation_info is not None: @@ -600,8 +602,7 @@ def _get_device_validation_information( if device == "cpu": device_dependent_kwargs["attn_algorithm"] = "math" - if device == "aiu" and only_last_token is not None: - device_dependent_kwargs["only_last_token"] = only_last_token + if device == "aiu": device_dependent_kwargs["last_n_tokens"] = 64 if "paged" in ATTN_NAME else 1 # Otherwise we need to get the AIU / CPU validation info @@ -623,7 +624,7 @@ def _get_device_validation_information( validation_info.save( get_validation_info_path( - validation_info_dir, + VALIDATION_INFO_DIR, model_path, batch_size, seq_length, @@ -703,7 +704,6 @@ def _run_validation_level_0( token_iter=0, device="aiu", tokenizer=tokenizer, - only_last_token="paged" not in ATTN_NAME, ) dprint("aiu validation info extracted for validation level 0") @@ -733,6 +733,7 @@ def _run_validation_level_1( model, micro_model_path, validation_zero_info, + record_property, ): iters = int(CUMULATIVE_TEST_TOKENS_PER_SEQUENCE) // max_new_tokens ce_fail_responses_list = [] @@ -781,7 +782,6 @@ def _run_validation_level_1( token_iter=i, device="aiu", tokenizer=tokenizer, - only_last_token=ATTN_TYPE != "paged", ) dprint(f"aiu validation info extracted for validation level 1 - iter={i}") @@ -810,7 +810,10 @@ def _run_validation_level_1( total_tokens += len(level_1_metrics) _check_failure_thresholds( - diff_fail_responses_list, ce_fail_responses_list, total_tokens + diff_fail_responses_list, + ce_fail_responses_list, + total_tokens, + record_property, ) @@ -823,12 +826,15 @@ def _run_cpu_aiu_validation_test( cpu_model, aiu_model, micro_model_path, + record_property, ): # Get the tokenizer and AIU / CPU models to compare tokenizer = AutoTokenizer.from_pretrained(model_path) # prepare input_ids - input_ids, extra_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) + input_ids, extra_kwargs = __prepare_inputs( + batch_size, seq_length, tokenizer, model_path + ) extra_kwargs["attn_name"] = ATTN_NAME if ( @@ -875,11 +881,12 @@ def _run_cpu_aiu_validation_test( aiu_model, micro_model_path, validation_zero_info, + record_property, ) @pytest.mark.parametrize( - "model_path,batch_size,seq_length,max_new_tokens", common_shapes + "model_path,batch_size,seq_length,max_new_tokens", COMMON_SHAPES ) def test_common_shapes( model_path, @@ -900,7 +907,6 @@ def test_common_shapes( # we don't currently support inferring gptq from get_model, so we must use an adapter with hf_configured gptq_kwargs_aiu, gptq_kwargs_cpu = __maybe_get_gptq_kwargs(model_path) - is_gptq = len(gptq_kwargs_aiu) != 0 is_fp8 = "fp8" in ATTN_NAME model_kwargs = _get_common_model_kwargs(is_gptq, model_path) @@ -926,4 +932,5 @@ def test_common_shapes( validation_model, model, micro_model_path, + record_property, ) From a743968bd066631b3af1a56d5a1abad5b8d78c03 Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 8 Oct 2025 10:54:37 +0000 
Subject: [PATCH 20/22] fix input prep Signed-off-by: Alex-Brooks --- tests/models/test_decoders.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index 7bc82b09..4f95e61e 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -742,6 +742,9 @@ def _run_validation_level_1( for i in range(iters): # for iteration 0, we have computed the cpu validation info in the prior step for seed=0, so skip if i != 0: + input_ids, extra_kwargs = __prepare_inputs( + batch_size, seq_length, tokenizer, model_path, seed=i + ) cpu_validation_info = _get_device_validation_information( model_path=model_path, batch_size=batch_size, From b25e44f5d0a1e0d82752a4b65219dee406faac4e Mon Sep 17 00:00:00 2001 From: Alex-Brooks Date: Wed, 8 Oct 2025 19:29:22 +0000 Subject: [PATCH 21/22] use setdefault for torch sendnn cache dir Signed-off-by: Alex-Brooks --- tests/models/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/conftest.py b/tests/models/conftest.py index 29fac434..e2d06176 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -25,7 +25,7 @@ def pytest_sessionstart(session): # NOTE: we should configure the cachedir before importing torchsendnn's # graph cache to prevent it from being initialized in the wrong place. - os.environ["TORCH_SENDNN_CACHE_DIR"] = os.path.join(os.getcwd(), ".cache") + os.environ.setdefault("TORCH_SENDNN_CACHE_DIR", os.path.join(os.getcwd(), ".cache")) def pytest_addoption(parser): From 4609ad5980aceab297825fa8e1408fa778e76c5d Mon Sep 17 00:00:00 2001 From: "Rashed Z. Bhatti, PhD" Date: Thu, 16 Oct 2025 16:01:39 +0000 Subject: [PATCH 22/22] Added head_dim override option to inference.py Signed-off-by: Rashed Z. Bhatti, PhD --- scripts/inference.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/scripts/inference.py b/scripts/inference.py index 3ec33f0e..78919673 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -20,6 +20,7 @@ from fms.models.llama import LLaMAConfig, _llama_factory_factory from fms.utils import generation from fms.utils.generation import pad_input_ids +from fms.utils import serialization from transformers import AutoTokenizer @@ -257,6 +258,15 @@ default=0, help="Timeout to use for messaging in minutes. Default set by PyTorch dist.init_process_group", ) + +parser.add_argument( + "--head_dim", + type=int, + default=None, + help="Override the head_dim in the model config", +) + + args = parser.parse_args() attention_map = { @@ -504,6 +514,12 @@ def select_int8_module( dprint(f"data_type={default_dtype}") dprint("=" * 60 + "\n") +if args.device_type == "aiu" and args.head_dim is not None: + serialization.extend_adapter( + "granite", "hf", ["weight_expansion_for_mismatched_head_dim"] + ) + + with stagger_region(args.stagger_load): model = get_model( args.architecture, @@ -516,6 +532,10 @@ def select_int8_module( group=dist.group.WORLD, linear_config=linear_config, fused_weights=fused_weights, + override_hf_pretrained_config=True + if args.device_type == "aiu" and args.head_dim is not None + else False, + head_dim=args.head_dim, ) ### Quantization
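# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch series): the new --head_dim flag
# defaults to None, so it only takes effect when a user explicitly passes it
# for an AIU run. The build_model_kwargs helper below is hypothetical and only
# demonstrates that opt-in pattern; the actual script forwards the values to
# fms get_model as shown in the hunk above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--device_type", type=str, default="cpu")
parser.add_argument(
    "--head_dim",
    type=int,
    default=None,
    help="Override the head_dim in the model config",
)

def build_model_kwargs(args):
    kwargs = {}
    if args.device_type == "aiu" and args.head_dim is not None:
        # only override the pretrained HF config when an override was requested
        kwargs["override_hf_pretrained_config"] = True
        kwargs["head_dim"] = args.head_dim
    return kwargs

print(build_model_kwargs(parser.parse_args(["--device_type", "aiu", "--head_dim", "128"])))
# -> {'override_hf_pretrained_config': True, 'head_dim': 128}
print(build_model_kwargs(parser.parse_args([])))
# -> {}
# ---------------------------------------------------------------------------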