1 file changed: +5 -1 lines changed

@@ -692,7 +692,11 @@ def add_to_logging(name):
 # The factor of 4 is when used with activation check-pointing,
 # otherwise it will be 3, but for 200B model, activation check-pointing will always be on.
 checkpoint_activations_factor = 4 if args.checkpoint_activations else 3
-flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size ** 2)) * (1. + (seq_len / (6. * hidden_size)) + (vocab_size / (16. * num_layers * hidden_size)))
+# GLU activations double the hidden states in the upscaling feed-forward in each transformer layer.
+# This leads to 16bsh^2 instead of 8bsh^2 for the first feed-forward layer in the MLP, so we increase the coefficient by 8.
+# Refer to https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/283#issue-1260805063 for more details.
+coefficient = 32 if args.glu_activation else 24
+flops_per_iteration = (coefficient * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size ** 2)) * (1. + (seq_len / (6. * hidden_size)) + (vocab_size / (16. * num_layers * hidden_size)))
 tflops = flops_per_iteration / (elapsed_time_per_iteration * args.world_size * (10 ** 12))

 # only the last rank process has a non-None _GLOBAL_TENSORBOARD_WRITER
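
For intuition, here is a small back-of-the-envelope sketch (not part of the PR) of where the 24 -> 32 coefficient comes from under the standard Megatron dense-matmul FLOPs accounting: a matmul of shape (m x k) @ (k x n) costs 2*m*k*n FLOPs; per layer and forward pass, the attention projections contribute 8bsh^2 (QKV plus the output projection) and the two MLP matmuls contribute 8bsh^2 each, for 24bsh^2 total. A GLU doubles the first MLP projection (value and gate paths) to 16bsh^2, raising the per-layer total to 32bsh^2. The helper below is hypothetical, written only to check that arithmetic.

# Back-of-the-envelope check of the 24 -> 32 coefficient (hypothetical helper,
# not code from the PR). Coefficients are in units of b*s*h^2 FLOPs per layer,
# forward pass only; the checkpoint_activations_factor handles forward/backward.
def dense_flops_per_layer_coefficient(glu: bool) -> int:
    # Attention: Q, K, V projections (3 matmuls of 2*h*h per token)
    # plus the output projection (2*h*h per token) -> 8*b*s*h^2.
    attention = 3 * 2 + 2
    # MLP up-projection h -> 4h: 2 * h * 4h = 8*h^2 per token.
    # With GLU the up-projection is doubled (value and gate), so 16*h^2.
    mlp_up = 16 if glu else 8
    # MLP down-projection 4h -> h: 2 * 4h * h = 8*h^2 per token.
    mlp_down = 8
    return attention + mlp_up + mlp_down

assert dense_flops_per_layer_coefficient(glu=False) == 24
assert dense_flops_per_layer_coefficient(glu=True) == 32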