Skip to content

Commit 95f47aa

Browse files
authored
[nightly] Update auto-gened configs to nightly (#652)
1 parent f28882e commit 95f47aa

File tree

16 files changed

+245
-272
lines changed

16 files changed

+245
-272
lines changed

benchmarks/gen_metadata/run.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212

1313
from os.path import abspath, exists
1414

15-
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
16-
1715

1816
def setup_tritonbench_cwd():
1917
original_dir = abspath(os.getcwd())
@@ -35,6 +33,7 @@ def setup_tritonbench_cwd():
3533
setup_tritonbench_cwd()
3634

3735
from tritonbench.operators import list_operators, load_opbench_by_name
36+
from tritonbench.utils.path_utils import REPO_PATH
3837

3938
# operators that are not supported by tritonbench-oss
4039
SKIP_OPERATORS = ["decoding_attention"]
@@ -105,7 +104,7 @@ def main() -> None:
105104
parser.add_argument(
106105
"--output",
107106
type=str,
108-
default=os.path.join(CURRENT_DIR, "metadata"),
107+
default=os.path.join(REPO_PATH.joinpath("tritonbench"), "metadata"),
109108
help="generate metadata yaml files to the specific directory",
110109
)
111110
args = parser.parse_args()

benchmarks/nightly/autogen.yaml

Lines changed: 123 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -1,146 +1,145 @@
11
fp16_addmm_fwd:
2-
op: addmm
3-
args: --op addmm --baseline aten_addmm --metrics tflops,speedup --only triton_addmm,streamk_addmm,pt2_triton_matmul,aten_addmm
2+
args: --op addmm --baseline aten_addmm --metrics latency,tflops,speedup --only streamk_addmm,triton_addmm,aten_addmm
3+
fp16_addmm_bwd:
4+
args: --op addmm --baseline aten_addmm --metrics latency,tflops,speedup --only streamk_addmm,triton_addmm,aten_addmm
5+
--bwd
46
bf16xint16_gemm_fwd:
5-
op: bf16xint16_gemm
6-
args: --op bf16xint16_gemm --metrics tflops --only bf16xbf16
7+
args: --op bf16xint16_gemm --baseline bf16xbf16 --metrics latency,tflops,speedup
8+
--only bf16xbf16,bf16xint16,bf16xint16_casted
9+
bf16_blackwell_attentions_fwd:
10+
args: --op blackwell_attentions --metrics latency,tflops --only triton_tutorial_flash_dp_blackwell,triton_tutorial_flash_dp_persistent_blackwell,triton_tutorial_flash_v2_blackwell,triton_tutorial_flash_v2_persistent_blackwell,triton_tutorial_flash_v2_tma_ws_persistent_blackwell
11+
disabled: true
12+
bf16_blackwell_attentions_bwd:
13+
args: --op blackwell_attentions --metrics latency,tflops --only triton_tutorial_flash_dp_blackwell,triton_tutorial_flash_dp_persistent_blackwell,triton_tutorial_flash_v2_blackwell,triton_tutorial_flash_v2_persistent_blackwell,triton_tutorial_flash_v2_tma_ws_persistent_blackwell
14+
--bwd
15+
disabled: true
16+
cross_entropy_fwd:
17+
args: --op cross_entropy --baseline cross_entropy_loss --metrics latency,speedup
18+
--only liger_cross_entropy_loss,cross_entropy_loss
19+
cross_entropy_bwd:
20+
args: --op cross_entropy --baseline cross_entropy_loss --metrics latency,speedup
21+
--only liger_cross_entropy_loss,cross_entropy_loss --bwd
22+
embedding_fwd:
23+
args: --op embedding --baseline torch_embedding --metrics latency,speedup --only
24+
liger_embedding,torch_embedding
25+
embedding_bwd:
26+
args: --op embedding --baseline torch_embedding --metrics latency,speedup --only
27+
liger_embedding,torch_embedding --bwd
728
bf16_flash_attention_fwd:
8-
op: flash_attention
929
args: --op flash_attention --baseline flash_v3 --metrics tflops,speedup --only triton_tutorial_flash_v2,flash_v3
10-
bf16_flex_attention_fwd:
11-
op: flex_attention
12-
args: --op flex_attention --baseline eager --metrics tflops,speedup --only compiled,eager
30+
bf16_flash_attention_bwd:
31+
args: --op flash_attention --baseline flash_v3 --metrics tflops,speedup --bwd --only
32+
triton_tutorial_flash_v2,flash_v3 --causal
1333
fp8_attention_fwd:
14-
op: fp8_attention
15-
args: --op fp8_attention --metrics tflops --only triton_flash_v2_tma
16-
disabled: true
17-
fp8_fused_quant_gemm_rowwise_fwd:
18-
op: fp8_fused_quant_gemm_rowwise
19-
args: --op fp8_fused_quant_gemm_rowwise --metrics tflops --only rms_norm_fused
34+
args: --op fp8_attention --baseline triton_flash_v2 --metrics latency,tflops,speedup
35+
--only triton_flash_v2,triton_flash_v2_tma,triton_flash_v2_ws
2036
disabled: true
2137
fp8_gemm_fwd:
22-
op: fp8_gemm
23-
args: --op fp8_gemm --baseline torch_fp8_gemm --metrics tflops,speedup --only triton_tma_persistent_fp8_gemm,torch_fp8_gemm
38+
args: --op fp8_gemm --baseline torch_fp8_gemm --metrics latency,tflops,speedup --only
39+
triton_fp8_gemm,triton_persistent_fp8_gemm,triton_tma_persistent_fp8_gemm,torch_fp8_gemm
2440
disabled: true
25-
fp8_gemm_blockwise_fwd:
26-
op: fp8_gemm_blockwise
27-
args: --op fp8_gemm_blockwise --baseline _cutlass --metrics tflops,speedup --only
28-
_triton,_cutlass
2941
fp8_gemm_rowwise_fwd:
30-
op: fp8_gemm_rowwise
31-
args: --op fp8_gemm_rowwise --baseline _cutlass_or_ck --metrics tflops,speedup --only
32-
_triton,_cutlass_or_ck
33-
disabled: true
34-
fp8_gemm_rowwise_grouped_fwd:
35-
op: fp8_gemm_rowwise_grouped
36-
args: --op fp8_gemm_rowwise_grouped --baseline _cutlass_or_ck --metrics tflops,speedup
37-
--only _triton,_cutlass_or_ck
42+
args: --op fp8_gemm_rowwise --baseline _cutlass_or_ck --metrics latency,tflops,speedup
43+
--only _aoti_fp8_triton_mm,_cutlass_or_ck
3844
disabled: true
39-
fp16_gemm_fwd:
40-
op: gemm
41-
args: --op gemm --baseline aten_matmul --metrics speedup --only triton_tutorial_matmul,aten_matmul
42-
fp16_grouped_gemm_fwd:
43-
op: grouped_gemm
44-
args: --op grouped_gemm --baseline torch --metrics tflops,speedup --only triton,torch
45-
int4_gemm_fwd:
46-
op: int4_gemm
47-
args: --op int4_gemm --baseline tinygemm --metrics tflops,speedup --only triton,tinygemm
48-
low_mem_dropout_fwd:
49-
op: low_mem_dropout
50-
args: --op low_mem_dropout --baseline torch_dropout --metrics speedup --only triton_dropout,torch_dropout
51-
bf16_ragged_attention_fwd:
52-
op: ragged_attention
53-
args: --op ragged_attention --metrics tflops --only hstu
54-
softmax_fwd:
55-
op: softmax
56-
args: --op softmax --baseline naive_softmax --metrics speedup --only triton_softmax,naive_softmax
57-
welford_fwd:
58-
op: welford
59-
args: --op welford --baseline test_no_welford --metrics speedup --only test_welford,test_no_welford
60-
jsd_fwd:
61-
op: jsd
62-
args: --op jsd --baseline torch_jsd --metrics speedup --only liger_jsd,torch_jsd
63-
kl_div_fwd:
64-
op: kl_div
65-
args: --op kl_div --baseline torch_kl_div --metrics speedup --only liger_kl_div,torch_kl_div
66-
layer_norm_fwd:
67-
op: layer_norm
68-
args: --op layer_norm --baseline torch_layer_norm --metrics speedup --only liger_layer_norm,torch_layer_norm
69-
swiglu_fwd:
70-
op: swiglu
71-
args: --op swiglu --baseline torch_swiglu --metrics speedup --only liger_swiglu,torch_swiglu
72-
rope_fwd:
73-
op: rope
74-
args: --op rope --baseline apply_rotary_pos_emb --metrics speedup --only liger_rotary_pos_emb,apply_rotary_pos_emb
75-
rms_norm_fwd:
76-
op: rms_norm
77-
args: --op rms_norm --baseline llama_rms --metrics speedup --only liger_rms,llama_rms
7845
fused_linear_cross_entropy_fwd:
79-
op: fused_linear_cross_entropy
80-
args: --op fused_linear_cross_entropy --baseline torch_lm_head_ce --metrics speedup
46+
args: --op fused_linear_cross_entropy --baseline torch_lm_head_ce --metrics latency,speedup
8147
--only liger_lm_head_ce,torch_lm_head_ce
82-
fused_linear_jsd_fwd:
83-
op: fused_linear_jsd
84-
args: --op fused_linear_jsd --baseline torch_lm_head_jsd --metrics speedup --only
85-
liger_lm_head_jsd,torch_lm_head_jsd
86-
geglu_fwd:
87-
op: geglu
88-
args: --op geglu --baseline torch_geglu --metrics speedup --only liger_geglu,torch_geglu
89-
cross_entropy_fwd:
90-
op: cross_entropy
91-
args: --op cross_entropy --baseline cross_entropy_loss --metrics speedup --only
92-
liger_cross_entropy_loss,cross_entropy_loss
93-
embedding_fwd:
94-
op: embedding
95-
args: --op embedding --baseline torch_embedding --metrics speedup --only liger_embedding,torch_embedding
96-
cross_entropy_bwd:
97-
op: cross_entropy
98-
args: --op cross_entropy --baseline cross_entropy_loss --metrics speedup --bwd --only
99-
liger_cross_entropy_loss,cross_entropy_loss
100-
embedding_bwd:
101-
op: embedding
102-
args: --op embedding --baseline torch_embedding --metrics speedup --bwd --only liger_embedding,torch_embedding
103-
bf16_flash_attention_bwd:
104-
op: flash_attention
105-
args: --op flash_attention --baseline flash_v3 --metrics tflops,speedup --bwd --only
106-
triton_tutorial_flash_v2,flash_v3 --causal
107-
bf16_flex_attention_bwd:
108-
op: flex_attention
109-
args: --op flex_attention --baseline eager --metrics tflops,speedup --bwd --only
110-
compiled,eager
11148
fused_linear_cross_entropy_bwd:
112-
op: fused_linear_cross_entropy
113-
args: --op fused_linear_cross_entropy --baseline torch_lm_head_ce --metrics speedup
114-
--bwd --only liger_lm_head_ce,torch_lm_head_ce
115-
fused_linear_jsd_bwd:
116-
op: fused_linear_jsd
117-
args: --op fused_linear_jsd --baseline torch_lm_head_jsd --metrics speedup --bwd
49+
args: --op fused_linear_cross_entropy --baseline torch_lm_head_ce --metrics latency,speedup
50+
--only liger_lm_head_ce,torch_lm_head_ce --bwd
51+
fused_linear_jsd_fwd:
52+
args: --op fused_linear_jsd --baseline torch_lm_head_jsd --metrics latency,speedup
11853
--only liger_lm_head_jsd,torch_lm_head_jsd
54+
fused_linear_jsd_bwd:
55+
args: --op fused_linear_jsd --baseline torch_lm_head_jsd --metrics latency,speedup
56+
--only liger_lm_head_jsd,torch_lm_head_jsd --bwd
57+
gather_gemv_fwd:
58+
args: --op gather_gemv --baseline eager_gather_gemv --metrics latency,speedup --only
59+
triton_gather_gemv,eager_gather_gemv
60+
geglu_fwd:
61+
args: --op geglu --baseline torch_geglu --metrics latency,speedup --only liger_geglu,torch_geglu
11962
geglu_bwd:
120-
op: geglu
121-
args: --op geglu --baseline torch_geglu --metrics speedup --bwd --only liger_geglu,torch_geglu
63+
args: --op geglu --baseline torch_geglu --metrics latency,speedup --only liger_geglu,torch_geglu
64+
--bwd
65+
fp16_gemm_fwd:
66+
args: --op gemm --baseline aten_matmul --metrics latency,speedup --only matmul_partition_k,streamk_matmul,triton_ops_matmul,triton_tutorial_matmul,aten_matmul
67+
fp16_gemm_bwd:
68+
args: --op gemm --baseline aten_matmul --metrics latency,speedup --only matmul_partition_k,streamk_matmul,triton_ops_matmul,triton_tutorial_matmul,aten_matmul
69+
--bwd
70+
bf16_grouped_gemm_fwd:
71+
args: --op grouped_gemm --baseline aten_grouped_mm --metrics latency,tflops,speedup
72+
--only triton_grouped_gemm,aten_grouped_mm
73+
int4_gemm_fwd:
74+
args: --op int4_gemm --baseline eager_int4_gemm --metrics latency,tflops,speedup
75+
--only preprocessed_triton_int4_gemm,triton_int4_gemm,eager_int4_gemm
76+
fp32_jagged_mean_fwd:
77+
args: --op jagged_mean --baseline torch_jagged_mean_torch_sum --metrics latency,speedup
78+
--only triton_jagged_mean_simple_fused,triton_jagged_mean_variable_length_loop,torch_jagged_mean_torch_sum
79+
fp32_jagged_softmax_fwd:
80+
args: --op jagged_softmax --baseline torch_jagged_softmax_unbind_torch_softmax --metrics
81+
latency,speedup --only triton_jagged_softmax_simple_fused,triton_jagged_softmax_variable_length_loop,torch_jagged_softmax_unbind_torch_softmax
82+
fp32_jagged_sum_fwd:
83+
args: --op jagged_sum --baseline torch_jagged_sum_no_pad --metrics latency,speedup
84+
--only triton_jagged_sum_no_pad_simple_fused,triton_jagged_sum_no_pad_variable_length_loop,torch_jagged_sum_no_pad
85+
jsd_fwd:
86+
args: --op jsd --baseline torch_jsd --metrics latency,speedup --only liger_jsd,torch_jsd
12287
jsd_bwd:
123-
op: jsd
124-
args: --op jsd --baseline torch_jsd --metrics speedup --bwd --only liger_jsd,torch_jsd
88+
args: --op jsd --baseline torch_jsd --metrics latency,speedup --only liger_jsd,torch_jsd
89+
--bwd
90+
kl_div_fwd:
91+
args: --op kl_div --baseline torch_kl_div --metrics latency,speedup --only liger_kl_div,torch_kl_div
12592
kl_div_bwd:
126-
op: kl_div
127-
args: --op kl_div --baseline torch_kl_div --metrics speedup --bwd --only liger_kl_div,torch_kl_div
93+
args: --op kl_div --baseline torch_kl_div --metrics latency,speedup --only liger_kl_div,torch_kl_div
94+
--bwd
95+
launch_latency_fwd:
96+
args: --op launch_latency --metrics walltime
97+
layer_norm_fwd:
98+
args: --op layer_norm --baseline torch_layer_norm --metrics latency,speedup --only
99+
liger_layer_norm,triton_fused_layer_norm,triton_layer_norm,torch_layer_norm
128100
layer_norm_bwd:
129-
op: layer_norm
130-
args: --op layer_norm --baseline torch_layer_norm --metrics speedup --bwd --only
131-
liger_layer_norm,torch_layer_norm
101+
args: --op layer_norm --baseline torch_layer_norm --metrics latency,speedup --only
102+
liger_layer_norm,triton_fused_layer_norm,triton_layer_norm,torch_layer_norm --bwd
103+
low_mem_dropout_fwd:
104+
args: --op low_mem_dropout --baseline eager_dropout --metrics latency,speedup --only
105+
seeded_dropout,triton_dropout,eager_dropout
106+
bf16_ragged_attention_fwd:
107+
args: --op ragged_attention --metrics latency,tflops --only hstu
132108
bf16_ragged_attention_bwd:
133-
op: ragged_attention
134-
args: --op ragged_attention --metrics tflops --bwd --only hstu
109+
args: --op ragged_attention --metrics latency,tflops --only hstu --bwd
110+
rms_norm_fwd:
111+
args: --op rms_norm --baseline llama_rms --metrics latency,speedup --only liger_rms,triton_fused_rmsnorm,llama_rms
135112
rms_norm_bwd:
136-
op: rms_norm
137-
args: --op rms_norm --baseline llama_rms --metrics speedup --bwd --only liger_rms,llama_rms
113+
args: --op rms_norm --baseline llama_rms --metrics latency,speedup --only liger_rms,triton_fused_rmsnorm,llama_rms
114+
--bwd
115+
rope_fwd:
116+
args: --op rope --baseline apply_rotary_pos_emb --metrics latency,speedup --only
117+
liger_rotary_pos_emb,apply_rotary_pos_emb
138118
rope_bwd:
139-
op: rope
140-
args: --op rope --baseline apply_rotary_pos_emb --metrics speedup --bwd --only liger_rotary_pos_emb,apply_rotary_pos_emb
119+
args: --op rope --baseline apply_rotary_pos_emb --metrics latency,speedup --only
120+
liger_rotary_pos_emb,apply_rotary_pos_emb --bwd
121+
fp16_softmax_fwd:
122+
args: --op softmax --baseline naive_softmax --metrics latency,speedup --only triton_softmax,naive_softmax
123+
fp16_softmax_bwd:
124+
args: --op softmax --baseline naive_softmax --metrics latency,speedup --only triton_softmax,naive_softmax
125+
--bwd
126+
sum_fwd:
127+
args: --op sum --baseline torch_sum --metrics latency,speedup --only triton_sum,torch_sum
128+
swiglu_fwd:
129+
args: --op swiglu --baseline torch_swiglu --metrics latency,speedup --only liger_swiglu,torch_swiglu
141130
swiglu_bwd:
142-
op: swiglu
143-
args: --op swiglu --baseline torch_swiglu --metrics speedup --bwd --only liger_swiglu,torch_swiglu
144-
launch_latency:
145-
op: launch_latency
146-
args: --op launch_latency --metrics walltime
131+
args: --op swiglu --baseline torch_swiglu --metrics latency,speedup --only liger_swiglu,torch_swiglu
132+
--bwd
133+
template_attention_fwd:
134+
args: --op template_attention --baseline test_no_exp2 --metrics latency,speedup
135+
--only test_no_exp2,test_with_exp2
136+
vector_add_fwd:
137+
args: --op vector_add --baseline torch_add --metrics latency,speedup --only triton_add,torch_add
138+
vector_exp_fwd:
139+
args: --op vector_exp --baseline torch_exp --metrics latency,speedup --only triton_exp,torch_exp
140+
vector_exp_bwd:
141+
args: --op vector_exp --baseline torch_exp --metrics latency,speedup --only triton_exp,torch_exp
142+
--bwd
143+
welford_fwd:
144+
args: --op welford --baseline eager_layer_norm --metrics latency,speedup --only
145+
test_no_welford,triton_welford,eager_layer_norm

0 commit comments

Comments
 (0)