Commit db9f217

hf_mode wip
Signed-off-by: Flavia Beo <flavia.beo@ibm.com>
1 parent 858a51e commit db9f217


scripts/generate_layers_metrics.py

Lines changed: 52 additions & 30 deletions
@@ -11,6 +11,8 @@
 from fms.models import get_model
 from fms.utils.generation import generate

+from transformers import AutoModelForCausalLM, AutoTokenizer
+
 from aiu_fms_testing_utils.testing.validation import get_default_validation_prefix

 from aiu_fms_testing_utils.utils import prepare_inputs
@@ -149,22 +151,27 @@ def __infer_layer(model, max_len, device, max_new_tokens, batch_size, tokenizer)

     if "generate" in mode:
         with torch.no_grad():
-            result = generate(
-                model,
-                ids,
-                max_new_tokens=max_new_tokens,
-                use_cache=use_cache,
-                do_sample=do_sample,
-                max_seq_len=max_seq_len,
-                timing="e2e",
-                eos_token_id=None,
-                contiguous_cache=True,
-                extra_kwargs={},
-            )
-            result, timings = result
-            logger.info(f"Generation completed: Result len is {len(result)}")
-            if len(result.shape) == 1:
-                result = result.unsqueeze(0)
+            # result = generate(
+            #     model,
+            #     ids,
+            #     max_new_tokens=max_new_tokens,
+            #     use_cache=use_cache,
+            #     do_sample=do_sample,
+            #     max_seq_len=max_seq_len,
+            #     timing="e2e",
+            #     eos_token_id=None,
+            #     contiguous_cache=True,
+            #     extra_kwargs={},
+            # )
+            # result, timings = result
+            # logger.info(f"Generation completed: Result len is {len(result)}")
+            # if len(result.shape) == 1:
+            #     result = result.unsqueeze(0)
+            model.generate(**ids,
+                           max_length=max_seq_len,
+                           max_new_tokens=max_new_tokens,
+                           do_sample=do_sample,
+                           use_cache=use_cache)
     else:
         result = model.forward(
             ids,
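
The replacement call above discards the generated tokens and the e2e timing that the FMS `generate` path produced, while the rest of `__infer_layer` still expects `result`. Below is a minimal sketch, not part of this commit, of how the Hugging Face call could keep those outputs; it assumes `ids` is the dict returned by the HF tokenizer and that `model`, `logger`, `max_new_tokens`, `do_sample`, and `use_cache` are in scope as in the surrounding function. `max_length` is omitted because, in current transformers versions, `max_new_tokens` takes precedence when both are set.

# Sketch only, not part of commit db9f217: capture the generated ids and a
# rough end-to-end timing so the downstream code in __infer_layer keeps working.
import time

with torch.no_grad():
    start = time.perf_counter()
    result = model.generate(
        **ids,                          # input_ids / attention_mask from the HF tokenizer
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        use_cache=use_cache,
    )
    timings = time.perf_counter() - start  # coarse stand-in for the old timing="e2e" output

logger.info(f"Generation completed: Result len is {len(result)}")
if len(result.shape) == 1:
    result = result.unsqueeze(0)
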
@@ -334,21 +341,36 @@ def generate_layers_metrics(model_path, batch_size, seq_length, max_new_tokens):

     tokenizer = tokenizers.get_tokenizer(model_path)

-    # prepare the cpu model
-    validation_model = get_model(
-        device_type="cpu",
-        data_type=torch.float32,
-        fused_weights=False,
-        **get_model_kwargs,
-    )
+    device = "auto"
+    model_path = "ibm-granite/granite-3.3-8b-base"
+    tokenizer = AutoTokenizer.from_pretrained(model_path)

-    # prepare the cuda model
-    validation_model_cuda = get_model(
-        device_type="cuda",
-        data_type=torch.float16,
-        fused_weights=False,
-        **get_model_kwargs,
-    )
+    # drop device_map if running on CPU
+    validation_model = AutoModelForCausalLM.from_pretrained(model_path,
+                                                            device_map="cpu",
+                                                            torch_dtype=torch.float32
+                                                            )
+
+    validation_model_cuda = AutoModelForCausalLM.from_pretrained(model_path,
+                                                                 device_map="cuda",
+                                                                 torch_dtype=torch.float16
+                                                                 )
+
+    # prepare the cpu model
+    # validation_model = get_model(
+    #     device_type="cpu",
+    #     data_type=torch.float32,
+    #     fused_weights=False,
+    #     **get_model_kwargs,
+    # )
+
+    # # prepare the cuda model
+    # validation_model_cuda = get_model(
+    #     device_type="cuda",
+    #     data_type=torch.float16,
+    #     fused_weights=False,
+    #     **get_model_kwargs,
+    # )

     layer_stack_cpu = __register_call_layers(model=validation_model,
                                              batch_size=batch_size,
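
The WIP block above hard-codes the `ibm-granite/granite-3.3-8b-base` checkpoint, leaves the new `device = "auto"` variable unused, and always loads a second float16 copy on CUDA. The sketch below is an assumption on my part, not what the commit does: it loads the same two validation models from the function's `model_path` argument and makes the GPU copy conditional on CUDA being available.

# Sketch only: reuse the model_path argument rather than a hard-coded
# checkpoint, and skip the float16 CUDA copy when no GPU is available.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_path)

validation_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cpu",
    torch_dtype=torch.float32,
)

validation_model_cuda = None
if torch.cuda.is_available():
    validation_model_cuda = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="cuda",
        torch_dtype=torch.float16,
    )
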
