Skip to content

Commit 894ebd6

Browse files
committed
Adds argparse option for loader mode
Signed-off-by: Flavia Beo <flavia.beo@ibm.com>
1 parent db9f217 commit 894ebd6

File tree

1 file changed

+62
-53
lines changed

1 file changed

+62
-53
lines changed

scripts/generate_layers_metrics.py

Lines changed: 62 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@
4949
required=True,
5050
help="Sets the output generation mode."
5151
)
52+
parser.add_argument(
53+
"--model_loader",
54+
choices=["fms", "hf"],
55+
default="fms",
56+
required=True,
57+
help="Which model loader/runner to be used; fms - IBM's Foundation Model Stack or hf - HuggingFace Transformers."
58+
)
5259
parser.add_argument(
5360
"--batch_sizes",
5461
type=str,
@@ -136,6 +143,7 @@ def __infer_layer(model, max_len, device, max_new_tokens, batch_size, tokenizer)
136143

137144
do_sample = False
138145
use_cache = True
146+
result = None
139147

140148
prompts = prepare_inputs(batch_size, max_len, tokenizer, sharegpt_path)
141149
ids, pad_input_ids = prompts
@@ -151,27 +159,29 @@ def __infer_layer(model, max_len, device, max_new_tokens, batch_size, tokenizer)
151159

152160
if "generate" in mode:
153161
with torch.no_grad():
154-
# result = generate(
155-
# model,
156-
# ids,
157-
# max_new_tokens=max_new_tokens,
158-
# use_cache=use_cache,
159-
# do_sample=do_sample,
160-
# max_seq_len=max_seq_len,
161-
# timing="e2e",
162-
# eos_token_id=None,
163-
# contiguous_cache=True,
164-
# extra_kwargs={},
165-
# )
166-
# result, timings = result
167-
# logger.info(f"Generation completed: Result len is {len(result)}")
168-
# if len(result.shape) == 1:
169-
# result = result.unsqueeze(0)
170-
model.generate(**ids,
171-
max_length=max_seq_len,
172-
max_new_tokens=max_new_token,
173-
do_sample=do_sample,
174-
use_cache=use_cache)
162+
if args.model_loader == "fms":
163+
result = generate(
164+
model,
165+
ids,
166+
max_new_tokens=max_new_tokens,
167+
use_cache=use_cache,
168+
do_sample=do_sample,
169+
max_seq_len=max_seq_len,
170+
timing="e2e",
171+
eos_token_id=None,
172+
contiguous_cache=True,
173+
extra_kwargs={},
174+
)
175+
if args.model_loader == "hf":
176+
result = model.generate(ids,
177+
max_length=max_len,
178+
max_new_tokens=max_new_token,
179+
do_sample=do_sample,
180+
use_cache=use_cache)
181+
result, timings = result
182+
logger.info(f"Generation completed: Result len is {len(result)}")
183+
if len(result.shape) == 1:
184+
result = result.unsqueeze(0)
175185
else:
176186
result = model.forward(
177187
ids,
@@ -339,38 +349,37 @@ def generate_layers_metrics(model_path, batch_size, seq_length, max_new_tokens):
339349
**micro_model_kwargs,
340350
}
341351

342-
tokenizer = tokenizers.get_tokenizer(model_path)
343-
344-
device = "auto"
345-
model_path = "ibm-granite/granite-3.3-8b-base"
346-
tokenizer = AutoTokenizer.from_pretrained(model_path)
347-
348-
# drop device_map if running on CPU
349-
validation_model = AutoModelForCausalLM.from_pretrained(model_path,
350-
device_map="cpu",
351-
torch_dtype=torch.float32
352-
)
353-
354-
validation_model_cuda = AutoModelForCausalLM.from_pretrained(model_path,
355-
device_map="cuda",
356-
torch_dtype=torch.float16
357-
)
358-
359-
# prepare the cpu model
360-
# validation_model = get_model(
361-
# device_type="cpu",
362-
# data_type=torch.float32,
363-
# fused_weights=False,
364-
# **get_model_kwargs,
365-
# )
366-
367-
# # prepare the cuda model
368-
# validation_model_cuda = get_model(
369-
# device_type="cuda",
370-
# data_type=torch.float16,
371-
# fused_weights=False,
372-
# **get_model_kwargs,
373-
# )
352+
if args.model_loader == "hf":
353+
tokenizer = AutoTokenizer.from_pretrained(model_path)
354+
355+
# prepare the cpu model
356+
validation_model = AutoModelForCausalLM.from_pretrained(model_path,
357+
device_map="cpu",
358+
torch_dtype=torch.float32
359+
)
360+
# prepare the cuda model
361+
validation_model_cuda = AutoModelForCausalLM.from_pretrained(model_path,
362+
device_map="cuda",
363+
torch_dtype=torch.float16
364+
)
365+
if args.model_loader == "fms":
366+
tokenizer = tokenizers.get_tokenizer(model_path)
367+
368+
# prepare the cpu model
369+
validation_model = get_model(
370+
device_type="cpu",
371+
data_type=torch.float32,
372+
fused_weights=False,
373+
**get_model_kwargs,
374+
)
375+
376+
# prepare the cuda model
377+
validation_model_cuda = get_model(
378+
device_type="cuda",
379+
data_type=torch.float16,
380+
fused_weights=False,
381+
**get_model_kwargs,
382+
)
374383

375384
layer_stack_cpu = __register_call_layers(model=validation_model,
376385
batch_size=batch_size,

0 commit comments

Comments
 (0)