@@ -109,7 +109,7 @@ def encode(x, x_space, hparams, name):
   with tf.variable_scope(name):
     (encoder_input, encoder_self_attention_bias,
      _) = transformer.transformer_prepare_encoder(x, x_space, hparams)
-    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout)
+    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout)
     return transformer.transformer_encoder(
         encoder_input, encoder_self_attention_bias, hparams)

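Context for the hunk above: in the TF 1.x API used here, `tf.nn.dropout` takes a *keep* probability rather than a drop rate, hence the `1.0 -` subtraction; the change itself just repoints the rate from the old `residual_dropout` hparam to the unified `dropout` hparam. A minimal sketch of that convention (`dropout_rate` and `x` are stand-ins, not names from the patch):

```python
import tensorflow as tf  # TF 1.x API, matching the diff

dropout_rate = 0.1  # stand-in for hparams.dropout
x = tf.random_normal([2, 8, 512])
# tf.nn.dropout expects keep_prob in TF 1.x, so convert the drop rate:
y = tf.nn.dropout(x, keep_prob=1.0 - dropout_rate)
```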
@@ -143,7 +143,7 @@ def vae_transformer_internal(inputs, targets, target_space, hparams):
   max_prestep = hparams.kl_warmup_steps
   prob_targets = 0.95 if is_training else 1.0
   targets_dropout_max = common_layers.inverse_lin_decay(max_prestep) - 0.01
-  targets = dropmask(targets, targets_dropout_max, is_training)
+  targets = dropmask(targets, targets_dropout_max * 0.7, is_training)
   targets = tf.cond(tf.less(tf.random_uniform([]), prob_targets),
                     lambda: targets, lambda: tf.zeros_like(targets))

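This hunk tempers how aggressively target tokens are hidden from the decoder: `common_layers.inverse_lin_decay` ramps from near zero to 1.0 over `kl_warmup_steps`, so `targets_dropout_max` grows during warmup, and the new `* 0.7` factor caps the eventual drop fraction around 70% instead of ~99%. `dropmask` itself is a helper defined elsewhere in this file; the sketch below is only a guess at its behavior, assuming the 4-D `[batch, length, 1, hidden]` targets layout used elsewhere in tensor2tensor:

```python
import tensorflow as tf  # TF 1.x

def dropmask_sketch(targets, dropout_max, is_training):
  """Hypothetical stand-in for this file's dropmask helper (assumption):
  zero out roughly a dropout_max fraction of target positions at train time."""
  if not is_training:
    return targets
  shape = tf.shape(targets)
  # One keep/drop decision per (batch, position), broadcast over depth.
  mask = tf.to_float(tf.random_uniform([shape[0], shape[1], 1, 1]) > dropout_max)
  return targets * mask
```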
@@ -168,7 +168,7 @@ def vae_transformer_internal(inputs, targets, target_space, hparams):
   # ret = tf.squeeze(to_decode, axis=2)

   # Randomize decoder inputs..
-  kl_loss *= common_layers.inverse_exp_decay(max_prestep) * 3.0
+  kl_loss *= common_layers.inverse_exp_decay(max_prestep) * 10.0
   return tf.expand_dims(ret, axis=2), kl_loss

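The last hunk raises the fully-warmed-up weight on the VAE's KL term from 3.0 to 10.0. `common_layers.inverse_exp_decay` anneals the multiplier from a small value toward 1.0 by `kl_warmup_steps`, so the KL penalty is phased in gradually rather than dominating early training. A sketch of that warmup curve, written from memory of the tensor2tensor helper rather than copied from it:

```python
import tensorflow as tf  # TF 1.x

def inverse_exp_decay_sketch(max_step, min_value=0.01):
  """Sketch of common_layers.inverse_exp_decay as I understand it
  (assumption): grows exponentially from min_value to 1.0 at max_step."""
  step = tf.to_float(tf.train.get_or_create_global_step())
  inv_base = tf.exp(tf.log(min_value) / float(max_step))
  return inv_base ** tf.maximum(float(max_step) - step, 0.0)

# KL weight as used in the diff: near zero early on, 10.0 after warmup.
# kl_loss *= inverse_exp_decay_sketch(hparams.kl_warmup_steps) * 10.0
```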