diff --git a/.gitignore b/.gitignore index 82f9275..a3ecbe4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +# Dump directory for prototyping and testing purposes +dump/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/src/model/lstm.py b/src/model/lstm.py index 0071751..aefb642 100644 --- a/src/model/lstm.py +++ b/src/model/lstm.py @@ -229,7 +229,7 @@ def generate(self, src_enc, src_len, max_len=200, sample_temperature=None): # add to unfinished sentences if cur_len == max_len: - generated[-1].masked_fill_(unfinished_sents.byte(), self.eos_index) + generated[-1].masked_fill_(unfinished_sents.bool(), self.eos_index) # sanity check assert (generated == self.eos_index).sum() == 2 * bs diff --git a/src/model/transformer.py b/src/model/transformer.py index 3f9cb92..c265ba1 100644 --- a/src/model/transformer.py +++ b/src/model/transformer.py @@ -711,7 +711,7 @@ def generate(self, src_enc, src_len, max_len=200, sample_temperature=None): # add to unfinished sentences if cur_len == max_len: - generated[-1].masked_fill_(unfinished_sents.byte(), self.eos_index) + generated[-1].masked_fill_(unfinished_sents.bool(), self.eos_index) # sanity check assert (generated == self.eos_index).sum() == 2 * bs diff --git a/src/optim.py b/src/optim.py index 4f87a80..5c84734 100644 --- a/src/optim.py +++ b/src/optim.py @@ -74,8 +74,8 @@ def step(self, closure=None): # grad.add_(group['weight_decay'], p.data) # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) denom = exp_avg_sq.sqrt().add_(group['eps']) # denom = exp_avg_sq.sqrt().clamp_(min=group['eps']) @@ -84,9 +84,9 @@ def step(self, closure=None): step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 if group['weight_decay'] != 0: - p.data.add_(-group['weight_decay'] * group['lr'], p.data) + p.data.add_(p.data, alpha=-group['weight_decay'] * group['lr']) - p.data.addcdiv_(-step_size, exp_avg, denom) + p.data.addcdiv_(exp_avg, denom, value=-step_size) return loss