|
11 | 11 | from fms.models import get_model |
12 | 12 | from fms.utils.generation import generate |
13 | 13 |
|
| 14 | +from transformers import AutoModelForCausalLM, AutoTokenizer |
| 15 | + |
14 | 16 | from aiu_fms_testing_utils.testing.validation import get_default_validation_prefix |
15 | 17 |
|
16 | 18 | from aiu_fms_testing_utils.utils import prepare_inputs |
@@ -149,22 +151,27 @@ def __infer_layer(model, max_len, device, max_new_tokens, batch_size, tokenizer) |
149 | 151 |
|
150 | 152 | if "generate" in mode: |
151 | 153 | with torch.no_grad(): |
152 | | - result = generate( |
153 | | - model, |
154 | | - ids, |
155 | | - max_new_tokens=max_new_tokens, |
156 | | - use_cache=use_cache, |
157 | | - do_sample=do_sample, |
158 | | - max_seq_len=max_seq_len, |
159 | | - timing="e2e", |
160 | | - eos_token_id=None, |
161 | | - contiguous_cache=True, |
162 | | - extra_kwargs={}, |
163 | | - ) |
164 | | - result, timings = result |
165 | | - logger.info(f"Generation completed: Result len is {len(result)}") |
166 | | - if len(result.shape) == 1: |
167 | | - result = result.unsqueeze(0) |
| 154 | + # result = generate( |
| 155 | + # model, |
| 156 | + # ids, |
| 157 | + # max_new_tokens=max_new_tokens, |
| 158 | + # use_cache=use_cache, |
| 159 | + # do_sample=do_sample, |
| 160 | + # max_seq_len=max_seq_len, |
| 161 | + # timing="e2e", |
| 162 | + # eos_token_id=None, |
| 163 | + # contiguous_cache=True, |
| 164 | + # extra_kwargs={}, |
| 165 | + # ) |
| 166 | + # result, timings = result |
| 167 | + # logger.info(f"Generation completed: Result len is {len(result)}") |
| 168 | + # if len(result.shape) == 1: |
| 169 | + # result = result.unsqueeze(0) |
| 170 | + result = model.generate(**ids,
| 171 | +     max_length=max_seq_len,
| 172 | +     max_new_tokens=max_new_tokens,
| 173 | +     do_sample=do_sample,
| 174 | +     use_cache=use_cache)
168 | 175 | else: |
169 | 176 | result = model.forward( |
170 | 177 | ids, |
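As a reference for this hunk, here is a minimal standalone sketch of the Hugging Face generation path it switches to. The prompt and the `max_new_tokens` value are placeholders, the checkpoint id is only an example, and it assumes `ids` at the call site is the tokenizer output (a `BatchEncoding`), which is what allows the `**ids` splat:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "ibm-granite/granite-3.3-8b-base"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)

# The tokenizer returns a BatchEncoding (dict-like, holding input_ids and
# attention_mask), which is why it can be splatted into generate() with **ids.
ids = tokenizer("Hello, my name is", return_tensors="pt")

with torch.no_grad():
    result = model.generate(
        **ids,
        max_new_tokens=32,   # HF counts only newly generated tokens here
        do_sample=False,
        use_cache=True,
    )

# generate() returns a 2D LongTensor of shape (batch, prompt_len + new_tokens),
# so the prompt tokens are included and no unsqueeze is needed for batch size 1.
print(tokenizer.decode(result[0], skip_special_tokens=True))
```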
@@ -334,21 +341,36 @@ def generate_layers_metrics(model_path, batch_size, seq_length, max_new_tokens): |
334 | 341 |
|
335 | 342 | tokenizer = tokenizers.get_tokenizer(model_path) |
336 | 343 |
|
337 | | - # prepare the cpu model |
338 | | - validation_model = get_model( |
339 | | - device_type="cpu", |
340 | | - data_type=torch.float32, |
341 | | - fused_weights=False, |
342 | | - **get_model_kwargs, |
343 | | - ) |
| 345 | + # NOTE: override model_path with an HF hub id that the transformers Auto* classes can resolve
| 345 | + model_path = "ibm-granite/granite-3.3-8b-base"
| 346 | + tokenizer = AutoTokenizer.from_pretrained(model_path)
344 | 347 |
|
345 | | - # prepare the cuda model |
346 | | - validation_model_cuda = get_model( |
347 | | - device_type="cuda", |
348 | | - data_type=torch.float16, |
349 | | - fused_weights=False, |
350 | | - **get_model_kwargs, |
351 | | - ) |
| 348 | + # fp32 reference copy on CPU (device_map requires accelerate; drop it for a plain CPU load)
| 349 | + validation_model = AutoModelForCausalLM.from_pretrained(model_path, |
| 350 | + device_map="cpu", |
| 351 | + torch_dtype=torch.float32 |
| 352 | + ) |
| 353 | + |
| 354 | + validation_model_cuda = AutoModelForCausalLM.from_pretrained(model_path, |
| 355 | + device_map="cuda", |
| 356 | + torch_dtype=torch.float16 |
| 357 | + ) |
| 358 | + |
| 359 | + # prepare the cpu model |
| 360 | + # validation_model = get_model( |
| 361 | + # device_type="cpu", |
| 362 | + # data_type=torch.float32, |
| 363 | + # fused_weights=False, |
| 364 | + # **get_model_kwargs, |
| 365 | + # ) |
| 366 | + |
| 367 | + # # prepare the cuda model |
| 368 | + # validation_model_cuda = get_model( |
| 369 | + # device_type="cuda", |
| 370 | + # data_type=torch.float16, |
| 371 | + # fused_weights=False, |
| 372 | + # **get_model_kwargs, |
| 373 | + # ) |
352 | 374 |
|
353 | 375 | layer_stack_cpu = __register_call_layers(model=validation_model, |
354 | 376 | batch_size=batch_size, |
|
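For the second hunk, a minimal standalone sketch of the same two-copy loading pattern (fp32 on CPU, fp16 on CUDA), plus an illustrative logits comparison; the prompt and the tolerance check are assumptions for demonstration only and are not part of the actual layer-metrics flow:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "ibm-granite/granite-3.3-8b-base"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)

# fp32 reference copy on CPU (device_map requires accelerate to be installed)
validation_model = AutoModelForCausalLM.from_pretrained(
    model_path, device_map="cpu", torch_dtype=torch.float32
)

if torch.cuda.is_available():
    # fp16 copy on GPU, mirroring the cuda model in the hunk above
    validation_model_cuda = AutoModelForCausalLM.from_pretrained(
        model_path, device_map="cuda", torch_dtype=torch.float16
    )

    ids = tokenizer("The capital of France is", return_tensors="pt")
    with torch.no_grad():
        logits_cpu = validation_model(**ids).logits
        cuda_ids = {k: v.to("cuda") for k, v in ids.items()}
        logits_gpu = validation_model_cuda(**cuda_ids).logits

    # fp16 and fp32 outputs will not match exactly; report the max deviation
    max_diff = (logits_cpu - logits_gpu.float().cpu()).abs().max().item()
    print(f"max |logits_cpu - logits_gpu| = {max_diff:.4f}")
```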