4949 required = True ,
5050 help = "Sets the output generation mode."
5151)
# Selects which runtime executes the model under test.
# NOTE: the original passed both `default="fms"` and `required=True`;
# in argparse a required option ignores its default and forces every
# caller to pass the flag, which would break existing invocations of
# this script. Keep the default and drop `required` so omitting the
# flag keeps the previous (fms) behavior.
parser.add_argument(
    "--model_loader",
    choices=["fms", "hf"],
    default="fms",
    help="Which model loader/runner to be used; fms - IBM's Foundation Model Stack or hf - HuggingFace Transformers.",
)
5259parser .add_argument (
5360 "--batch_sizes" ,
5461 type = str ,
@@ -136,6 +143,7 @@ def __infer_layer(model, max_len, device, max_new_tokens, batch_size, tokenizer)
136143
137144 do_sample = False
138145 use_cache = True
146+ result = None
139147
140148 prompts = prepare_inputs (batch_size , max_len , tokenizer , sharegpt_path )
141149 ids , pad_input_ids = prompts
@@ -151,27 +159,29 @@ def __infer_layer(model, max_len, device, max_new_tokens, batch_size, tokenizer)
151159
152160 if "generate" in mode :
153161 with torch .no_grad ():
154- # result = generate(
155- # model,
156- # ids,
157- # max_new_tokens=max_new_tokens,
158- # use_cache=use_cache,
159- # do_sample=do_sample,
160- # max_seq_len=max_seq_len,
161- # timing="e2e",
162- # eos_token_id=None,
163- # contiguous_cache=True,
164- # extra_kwargs={},
165- # )
166- # result, timings = result
167- # logger.info(f"Generation completed: Result len is {len(result)}")
168- # if len(result.shape) == 1:
169- # result = result.unsqueeze(0)
170- model .generate (** ids ,
171- max_length = max_seq_len ,
172- max_new_tokens = max_new_token ,
173- do_sample = do_sample ,
174- use_cache = use_cache )
162+ if args .model_loader == "fms" :
163+ result = generate (
164+ model ,
165+ ids ,
166+ max_new_tokens = max_new_tokens ,
167+ use_cache = use_cache ,
168+ do_sample = do_sample ,
169+ max_seq_len = max_seq_len ,
170+ timing = "e2e" ,
171+ eos_token_id = None ,
172+ contiguous_cache = True ,
173+ extra_kwargs = {},
174+ )
175+ if args .model_loader == "hf" :
176+ result = model .generate (ids ,
177+ max_length = max_len ,
178+ max_new_tokens = max_new_token ,
179+ do_sample = do_sample ,
180+ use_cache = use_cache )
181+ result , timings = result
182+ logger .info (f"Generation completed: Result len is { len (result )} " )
183+ if len (result .shape ) == 1 :
184+ result = result .unsqueeze (0 )
175185 else :
176186 result = model .forward (
177187 ids ,
@@ -339,38 +349,37 @@ def generate_layers_metrics(model_path, batch_size, seq_length, max_new_tokens):
339349 ** micro_model_kwargs ,
340350 }
341351
342- tokenizer = tokenizers .get_tokenizer (model_path )
343-
344- device = "auto"
345- model_path = "ibm-granite/granite-3.3-8b-base"
346- tokenizer = AutoTokenizer .from_pretrained (model_path )
347-
348- # drop device_map if running on CPU
349- validation_model = AutoModelForCausalLM .from_pretrained (model_path ,
350- device_map = "cpu" ,
351- torch_dtype = torch .float32
352- )
353-
354- validation_model_cuda = AutoModelForCausalLM .from_pretrained (model_path ,
355- device_map = "cuda" ,
356- torch_dtype = torch .float16
357- )
358-
359- # prepare the cpu model
360- # validation_model = get_model(
361- # device_type="cpu",
362- # data_type=torch.float32,
363- # fused_weights=False,
364- # **get_model_kwargs,
365- # )
366-
367- # # prepare the cuda model
368- # validation_model_cuda = get_model(
369- # device_type="cuda",
370- # data_type=torch.float16,
371- # fused_weights=False,
372- # **get_model_kwargs,
373- # )
352+ if args .model_loader == "hf" :
353+ tokenizer = AutoTokenizer .from_pretrained (model_path )
354+
355+ # prepare the cpu model
356+ validation_model = AutoModelForCausalLM .from_pretrained (model_path ,
357+ device_map = "cpu" ,
358+ torch_dtype = torch .float32
359+ )
360+ # prepare the cuda model
361+ validation_model_cuda = AutoModelForCausalLM .from_pretrained (model_path ,
362+ device_map = "cuda" ,
363+ torch_dtype = torch .float16
364+ )
365+ if args .model_loader == "fms" :
366+ tokenizer = tokenizers .get_tokenizer (model_path )
367+
368+ # prepare the cpu model
369+ validation_model = get_model (
370+ device_type = "cpu" ,
371+ data_type = torch .float32 ,
372+ fused_weights = False ,
373+ ** get_model_kwargs ,
374+ )
375+
376+ # prepare the cuda model
377+ validation_model_cuda = get_model (
378+ device_type = "cuda" ,
379+ data_type = torch .float16 ,
380+ fused_weights = False ,
381+ ** get_model_kwargs ,
382+ )
374383
375384 layer_stack_cpu = __register_call_layers (model = validation_model ,
376385 batch_size = batch_size ,
0 commit comments