5050from fastdeploy .model_executor .models .ernie4_5_moe import Ernie4_5_Attention
5151from fastdeploy .model_executor .ops .gpu import get_padding_offset
5252
53- os .environ .setdefault ("DG_NVCC_OVERRIDE_CPP_STANDARD" , "17" )
53+ if "nvidia graphics device" in paddle .device .cuda .get_device_name ().lower ():
54+ # (ZKK): CI machine.
55+ os .environ .setdefault ("DG_NVCC_OVERRIDE_CPP_STANDARD" , "17" )
5456
5557
5658class TestAttentionPerformance (unittest .TestCase ):
@@ -119,10 +121,10 @@ def create_model_config_json(self) -> str:
119121 "dtype" : "bfloat16" ,
120122 "hidden_size" : 4096 ,
121123 "max_position_embeddings" : 131072 ,
122- "max_model_len" : 2 * ( 9000 + 128 ) ,
124+ "max_model_len" : 5500 ,
123125 "num_attention_heads" : 32 ,
124126 "num_key_value_heads" : 4 ,
125- "num_hidden_layers" : 39 ,
127+ "num_hidden_layers" : 5 ,
126128 }
127129 model_dir = tempfile .mkdtemp (prefix = "tmp_model_config_" )
128130 config_path = os .path .join (model_dir , "config.json" )
@@ -293,7 +295,6 @@ def test_decode_performance_with_prefill(self):
293295 # Test parameters
294296 test_steps = 100
295297 prefill_batch_size = 1
296- decode_batch_size = 100 # This can be configured as needed
297298 prefill_seq_len = 4096
298299 use_dynamic_quant = True
299300 act_tensor_dtype = paddle .bfloat16
@@ -317,8 +318,7 @@ def test_decode_performance_with_prefill(self):
317318
318319 paddle .device .synchronize ()
319320
320- import paddle .profiler as profiler
321-
321+ # import paddle.profiler as profiler
322322 # p = profiler.Profiler(
323323 # targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
324324 # on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
@@ -341,56 +341,57 @@ def test_decode_performance_with_prefill(self):
341341
342342 # p.stop()
343343
344- decode_hidden_states = paddle .randn (
345- [decode_batch_size , self .fd_config .model_config .hidden_size ], dtype = act_tensor_dtype
346- )
344+ # p = profiler.Profiler(
345+ # targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
346+ # on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
347+ # )
347348
348- forward_meta = self .create_forward_meta (
349- batch_size = decode_batch_size ,
350- seq_len = 5000 ,
351- mode = ForwardMode .DECODE ,
352- fd_config = self .fd_config ,
353- attn_backend = self .attn_backend ,
354- use_dynamic_quant = use_dynamic_quant ,
355- )
349+ # p.start()
350+ # p.step()
356351
357- self .attn_backend .init_attention_metadata (forward_meta )
352+ for decode_batch_size in [10 , 20 , 40 , 60 , 80 , 100 , 128 ]:
353+ decode_hidden_states = paddle .randn (
354+ [decode_batch_size , self .fd_config .model_config .hidden_size ], dtype = act_tensor_dtype
355+ )
358356
359- p = profiler .Profiler (
360- targets = [profiler .ProfilerTarget .CPU , profiler .ProfilerTarget .GPU ],
361- on_trace_ready = profiler .export_chrome_tracing ("./profile_log" ),
362- )
357+ forward_meta = self .create_forward_meta (
358+ batch_size = decode_batch_size ,
359+ seq_len = 5000 ,
360+ mode = ForwardMode .DECODE ,
361+ fd_config = self .fd_config ,
362+ attn_backend = self .attn_backend ,
363+ use_dynamic_quant = use_dynamic_quant ,
364+ )
363365
364- p .start ()
365- p .step ()
366+ self .attn_backend .init_attention_metadata (forward_meta )
366367
367- paddle .device .synchronize ()
368+ paddle .device .synchronize ()
368369
369- # A warm-up run is required first, because the preprocessing has been moved into the first layer.
370- self .attn_forward (forward_meta , decode_hidden_states )
370+ # A warm-up run is required first, because the preprocessing has been moved into the first layer.
371+ self .attn_forward (forward_meta , decode_hidden_states )
371372
372- attn_cuda_graphs = graphs .CUDAGraph ()
373- attn_cuda_graphs .capture_begin ()
373+ attn_cuda_graphs = graphs .CUDAGraph ()
374+ attn_cuda_graphs .capture_begin ()
374375
375- self .attn_forward (forward_meta , decode_hidden_states )
376+ self .attn_forward (forward_meta , decode_hidden_states )
376377
377- attn_cuda_graphs .capture_end ()
378+ attn_cuda_graphs .capture_end ()
378379
379- start_events = [paddle .device .cuda .Event (enable_timing = True ) for _ in range (test_steps )]
380- end_events = [paddle .device .cuda .Event (enable_timing = True ) for _ in range (test_steps )]
381- for i in range (test_steps ):
382- start_events [i ].record ()
380+ start_events = [paddle .device .cuda .Event (enable_timing = True ) for _ in range (test_steps )]
381+ end_events = [paddle .device .cuda .Event (enable_timing = True ) for _ in range (test_steps )]
382+ for i in range (test_steps ):
383+ start_events [i ].record ()
383384
384- attn_cuda_graphs .replay ()
385- # self.attn_forward(forward_meta, decode_hidden_states)
385+ attn_cuda_graphs .replay ()
386+ # self.attn_forward(forward_meta, decode_hidden_states)
386387
387- end_events [i ].record ()
388- paddle .device .synchronize ()
388+ end_events [i ].record ()
389+ paddle .device .synchronize ()
389390
390- times = np .array ([round (s .elapsed_time (e ), 1 ) for s , e in zip (start_events , end_events )])[1 :]
391- print (times [- 5 :])
391+ times = np .array ([round (s .elapsed_time (e ), 1 ) for s , e in zip (start_events , end_events )])[1 :]
392+ print (times [- 5 :])
392393
393- p .stop ()
394+ # p.stop()
394395
395396
396397if __name__ == "__main__" :
0 commit comments