
Commit 6584ee9

[unitest]clean code (#5094)
1 parent d112353 commit 6584ee9

3 files changed: +74 additions, -70 deletions

tests/layers/test_attention_layer.py

Lines changed: 43 additions & 42 deletions
@@ -50,7 +50,9 @@
 from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_Attention
 from fastdeploy.model_executor.ops.gpu import get_padding_offset
 
-os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")
+if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
+    # (ZKK): CI machine.
+    os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")
 
 
 class TestAttentionPerformance(unittest.TestCase):
@@ -119,10 +121,10 @@ def create_model_config_json(self) -> str:
             "dtype": "bfloat16",
             "hidden_size": 4096,
             "max_position_embeddings": 131072,
-            "max_model_len": 2 * (9000 + 128),
+            "max_model_len": 5500,
             "num_attention_heads": 32,
             "num_key_value_heads": 4,
-            "num_hidden_layers": 39,
+            "num_hidden_layers": 5,
         }
         model_dir = tempfile.mkdtemp(prefix="tmp_model_config_")
         config_path = os.path.join(model_dir, "config.json")
@@ -293,7 +295,6 @@ def test_decode_performance_with_prefill(self):
         # Test parameters
         test_steps = 100
         prefill_batch_size = 1
-        decode_batch_size = 100  # This can be configured as needed
         prefill_seq_len = 4096
         use_dynamic_quant = True
         act_tensor_dtype = paddle.bfloat16
@@ -317,8 +318,7 @@ def test_decode_performance_with_prefill(self):
 
         paddle.device.synchronize()
 
-        import paddle.profiler as profiler
-
+        # import paddle.profiler as profiler
         # p = profiler.Profiler(
         #     targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
         #     on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
@@ -341,56 +341,57 @@
 
         # p.stop()
 
-        decode_hidden_states = paddle.randn(
-            [decode_batch_size, self.fd_config.model_config.hidden_size], dtype=act_tensor_dtype
-        )
+        # p = profiler.Profiler(
+        #     targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+        #     on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
+        # )
 
-        forward_meta = self.create_forward_meta(
-            batch_size=decode_batch_size,
-            seq_len=5000,
-            mode=ForwardMode.DECODE,
-            fd_config=self.fd_config,
-            attn_backend=self.attn_backend,
-            use_dynamic_quant=use_dynamic_quant,
-        )
+        # p.start()
+        # p.step()
 
-        self.attn_backend.init_attention_metadata(forward_meta)
+        for decode_batch_size in [10, 20, 40, 60, 80, 100, 128]:
+            decode_hidden_states = paddle.randn(
+                [decode_batch_size, self.fd_config.model_config.hidden_size], dtype=act_tensor_dtype
+            )
 
-        p = profiler.Profiler(
-            targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
-            on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
-        )
+            forward_meta = self.create_forward_meta(
+                batch_size=decode_batch_size,
+                seq_len=5000,
+                mode=ForwardMode.DECODE,
+                fd_config=self.fd_config,
+                attn_backend=self.attn_backend,
+                use_dynamic_quant=use_dynamic_quant,
+            )
 
-        p.start()
-        p.step()
+            self.attn_backend.init_attention_metadata(forward_meta)
 
-        paddle.device.synchronize()
+            paddle.device.synchronize()
 
-        # Must warm up once first! The preprocessing is deferred into the first layer!
-        self.attn_forward(forward_meta, decode_hidden_states)
+            # Must warm up once first! The preprocessing is deferred into the first layer!
+            self.attn_forward(forward_meta, decode_hidden_states)
 
-        attn_cuda_graphs = graphs.CUDAGraph()
-        attn_cuda_graphs.capture_begin()
+            attn_cuda_graphs = graphs.CUDAGraph()
+            attn_cuda_graphs.capture_begin()
 
-        self.attn_forward(forward_meta, decode_hidden_states)
+            self.attn_forward(forward_meta, decode_hidden_states)
 
-        attn_cuda_graphs.capture_end()
+            attn_cuda_graphs.capture_end()
 
-        start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        for i in range(test_steps):
-            start_events[i].record()
+            start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+            end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+            for i in range(test_steps):
+                start_events[i].record()
 
-            attn_cuda_graphs.replay()
-            # self.attn_forward(forward_meta, decode_hidden_states)
+                attn_cuda_graphs.replay()
+                # self.attn_forward(forward_meta, decode_hidden_states)
 
-            end_events[i].record()
-        paddle.device.synchronize()
+                end_events[i].record()
+            paddle.device.synchronize()
 
-        times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
-        print(times[-5:])
+            times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
+            print(times[-5:])
 
-        p.stop()
+            # p.stop()
 
 
 if __name__ == "__main__":
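
Note on the pattern above: the decode benchmark now sweeps several batch sizes in one test, and for each size it warms up once (the in-test comment explains that preprocessing is deferred into the first layer), captures the attention forward into a CUDA graph, and times graph replays with CUDA events, discarding the first sample. A self-contained sketch of that capture/replay timing loop, with a placeholder matmul standing in for self.attn_forward (the Paddle calls are the same ones the test uses):

import numpy as np
import paddle
from paddle.device.cuda import graphs

def workload(x):
    # hypothetical stand-in for self.attn_forward(forward_meta, decode_hidden_states)
    return paddle.matmul(x, x, transpose_y=True)

x = paddle.randn([128, 4096], dtype=paddle.bfloat16)

workload(x)  # warm up once so lazy setup is not recorded into the graph
paddle.device.synchronize()

g = graphs.CUDAGraph()
g.capture_begin()
workload(x)  # recorded into the graph, not executed eagerly
g.capture_end()

test_steps = 100
starts = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
ends = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
for i in range(test_steps):
    starts[i].record()
    g.replay()  # relaunches the captured kernels; shapes and buffers stay fixed
    ends[i].record()
paddle.device.synchronize()

times = np.array([s.elapsed_time(e) for s, e in zip(starts, ends)])[1:]  # drop the noisy first sample
print(times.mean(), "ms per replay")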

tests/layers/test_ffn.py

Lines changed: 11 additions & 14 deletions
@@ -39,14 +39,17 @@
 from fastdeploy.worker.worker_process import init_distributed_environment
 
 paddle.set_default_dtype("bfloat16")
+if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
+    # (ZKK): CI machine.
+    os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")
 
 
 class FFNWrapper(paddle.nn.Layer):
     def __init__(self, model_config: ModelConfig):
         super().__init__()
         self.model_config = model_config
 
-        self.intermediate_size = 3584
+        self.intermediate_size = self.model_config.intermediate_size
         self.hidden_size = self.model_config.hidden_size
         self.prefix = "hahahha"
         self.fd_config = FDConfig(
@@ -94,10 +97,9 @@ def __init__(self, model_config: ModelConfig):
 class TestFusedMoE(unittest.TestCase):
     def setUp(self) -> None:
         self.architectures = ["Ernie4_5_MoeForCausalLM"]
-        self.hidden_size = 7168
-        self.moe_intermediate_size = 1
-        self.moe_num_experts = 1
-        self.moe_k = 1
+        self.hidden_size = 4096
+        self.intermediate_size = 2048
+        self.num_layers = 1
         self.hidden_act = "silu"
         self.num_attention_heads = 64
         self.model_config = self.build_model_config()
@@ -115,9 +117,7 @@ def build_config_json(self) -> str:
         config_dict = {
             "architectures": self.architectures,
             "hidden_size": self.hidden_size,
-            "moe_intermediate_size": self.moe_intermediate_size,
-            "moe_num_experts": self.moe_num_experts,
-            "moe_k": self.moe_k,
+            "intermediate_size": self.intermediate_size,
             "hidden_act": self.hidden_act,
             "num_attention_heads": self.num_attention_heads,
             "dtype": "bfloat16",
@@ -135,20 +135,17 @@ def test_ffn(self):
 
         ffn = FFNWrapper(self.model_config)
 
-        # (ZKK): disable this test,
-        # CI machine does not support deepgemm blockwise_fp8, compilation error.
-        return
-
         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100
-        for idx, num_tokens in enumerate([10, 20, 40, 60, 80, 100, 128, 160, 192, 256, 512, 1024, 2048, 4096]):
+        test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256, 4096, 4096 * 4]
+        for idx, num_tokens in enumerate(test_token_nums):
 
             cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)
 
             moe_cuda_graphs[idx] = graphs.CUDAGraph()
             moe_cuda_graphs[idx].capture_begin()
 
-            num_layers = 80
+            num_layers = self.num_layers
             for _ in range(num_layers):
                 out = ffn.ffn(cache_hidden_states[idx])
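
Both tests now gate the DG_NVCC_OVERRIDE_CPP_STANDARD override on the reported GPU name instead of setting it unconditionally. Going by the (ZKK) comments and the removed note about deepgemm blockwise_fp8 compilation errors, the CI machine reports the generic name "NVIDIA Graphics Device" and needs C++17 forced for the DeepGEMM NVCC build, while other GPUs keep their defaults. The isolated pattern, for reference (assumes a CUDA build of PaddlePaddle):

import os
import paddle

# Only the CI GPU reports this generic device name; force C++17 there.
# setdefault leaves any value the user has already exported untouched.
if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
    os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")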

tests/layers/test_fusedmoe.py

Lines changed: 20 additions & 14 deletions
@@ -535,12 +535,12 @@ def __init__(
 class TestFusedMoE(unittest.TestCase):
     def setUp(self) -> None:
         self.architectures = ["Ernie4_5_MoeForCausalLM"]
-        self.hidden_size = 7168
-        self.moe_intermediate_size = 3584
-        self.moe_num_experts = 64
+        self.hidden_size = 4096
+        self.moe_intermediate_size = 2048
+        self.moe_num_experts = 160
         self.moe_k = 8
-        self.hidden_act = "silu"
-        self.num_attention_heads = 64
+        self.num_layers = 2
+        self.num_attention_heads = -1
         self.model_config = self.build_model_config()
 
     def build_model_config(self) -> ModelConfig:
@@ -559,7 +559,6 @@ def build_config_json(self) -> str:
             "moe_intermediate_size": self.moe_intermediate_size,
             "moe_num_experts": self.moe_num_experts,
             "moe_k": self.moe_k,
-            "hidden_act": self.hidden_act,
             "num_attention_heads": self.num_attention_heads,
             "dtype": "bfloat16",
         }
@@ -590,16 +589,18 @@ def test_fused_moe(self):
         # This line must be kept, otherwise uniformity is affected!
         paddle.seed(ep_rank + 100)
 
-        num_layers = 80
-        real_weight_layers = 20
+        num_layers = self.num_layers
+        real_weight_layers = num_layers // 2
         fused_moe = [None] * real_weight_layers
         for i in range(real_weight_layers):
             fused_moe[i] = FuseMoEWrapper(self.model_config, tp_size, tp_rank, ep_size, ep_rank, nnodes=nnodes)
 
         moe_cuda_graphs = [None] * 100
         cache_hidden_states = [None] * 100
-        test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256]
-        # test_token_nums = [1024 * i for i in [1,2,4,8,16,32]]
+        is_decoder = fused_moe[0].fd_config.model_config.moe_phase.phase == "decode"
+        test_token_nums = [4096 * i for i in [1, 2, 4, 8]]
+        if is_decoder:
+            test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256]
         for idx, num_tokens in enumerate(test_token_nums):
 
             cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)
@@ -610,20 +611,25 @@ def fake_model_run():
 
                 return out
 
-            moe_cuda_graphs[idx] = graphs.CUDAGraph()
-            moe_cuda_graphs[idx].capture_begin()
+            if is_decoder:
+                moe_cuda_graphs[idx] = graphs.CUDAGraph()
+                moe_cuda_graphs[idx].capture_begin()
 
             fake_model_run()
 
-            moe_cuda_graphs[idx].capture_end()
+            if is_decoder:
+                moe_cuda_graphs[idx].capture_end()
 
             num_tests = 20
             start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
            end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
             for i in range(num_tests):
                 start_events[i].record()
 
-                moe_cuda_graphs[idx].replay()
+                if is_decoder:
+                    moe_cuda_graphs[idx].replay()
+                else:
+                    fake_model_run()
 
                 end_events[i].record()
             paddle.device.cuda.synchronize()
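
The MoE benchmark above now branches on phase: decode-sized batches (tens to a few hundred tokens) are captured once into a CUDA graph and replayed, while prefill-sized batches (thousands of tokens) run eagerly inside the timing loop. A condensed sketch of that branching, where run_moe is a hypothetical stand-in for fake_model_run and is_decoder mirrors the moe_phase.phase == "decode" check:

import paddle
from paddle.device.cuda import graphs

def benchmark(run_moe, is_decoder: bool, num_tests: int = 20):
    graph = None
    if is_decoder:
        # small, fixed-shape decode batches are worth capturing
        graph = graphs.CUDAGraph()
        graph.capture_begin()
    run_moe()  # recorded in decode; a plain warm-up run in prefill
    if is_decoder:
        graph.capture_end()

    starts = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    ends = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    for i in range(num_tests):
        starts[i].record()
        if is_decoder:
            graph.replay()  # replay the captured decode step
        else:
            run_moe()  # prefill runs eagerly every iteration
        ends[i].record()
    paddle.device.cuda.synchronize()
    return [s.elapsed_time(e) for s, e in zip(starts, ends)]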
