|
17 | 17 |
|
18 | 18 | * Text2TextProblem: input=text, target=text. |
19 | 19 | * Text2ClassProblem: input=text, target=class. |
20 | | -* Text2RealProblem: input=text, target=float. |
21 | 20 | * Text2SelfProblem (for language modeling): target=text |
22 | 21 | * QuestionAndContext2TextProblem: input=text, context=text, target=text. |
23 | 22 |
|
@@ -606,94 +605,6 @@ def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): |
606 | 605 | yield {"inputs": inputs, "targets": [label]} |
607 | 606 |
|
608 | 607 |
|
609 | | -class Text2RealProblem(Text2TextProblem): |
610 | | - """Base class for text regression problems with one or more tasks. |
611 | | - Suitable for text-based problems where targets are continuous, real values. |
612 | | - When ntasks = 1, each text example is mapped to a single scalar value. When |
613 | | - ntasks > 1, each text example is mapped to a 1-d vector of length ntasks. |
614 | | - """ |
615 | | - |
616 | | - @property |
617 | | - def ntasks(self): |
618 | | - """Set to n > 1 for multitask regression.""" |
619 | | - return 1 |
620 | | - |
621 | | - def generate_samples(self, data_dir, tmp_dir, dataset_split): |
622 | | - """Generate samples of text and real-valued target pairs. |
623 | | - Each yielded dict will be a single example. The inputs should be raw text. |
624 | | - The target should be a list containing ntasks floats. |
625 | | - Args: |
626 | | - data_dir: final data directory. Typically only used in this method to copy |
627 | | - over user-supplied vocab files (for example, if vocab_type == |
628 | | - VocabType.TOKEN). |
629 | | - tmp_dir: temporary directory that you can use for downloading and scratch. |
630 | | - dataset_split: problem.DatasetSplit, which data split to generate samples |
631 | | - for (for example, training and evaluation). |
632 | | - Yields: |
633 | | - {"inputs": text, "targets": [x1, x2, ..., xN]} where N is ntasks |
634 | | - """ |
635 | | - raise NotImplementedError() |
636 | | - |
637 | | - def generate_text_for_vocab(self, data_dir, tmp_dir): |
638 | | - for i, sample in enumerate( |
639 | | - self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN)): |
640 | | - yield sample["inputs"] |
641 | | - if self.max_samples_for_vocab and (i + 1) >= self.max_samples_for_vocab: |
642 | | - break |
643 | | - |
644 | | - def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): |
645 | | - generator = self.generate_samples(data_dir, tmp_dir, dataset_split) |
646 | | - encoder = self.get_or_create_vocab(data_dir, tmp_dir) |
647 | | - for sample in generator: |
648 | | - inputs = encoder.encode(sample["inputs"]) |
649 | | - inputs.append(text_encoder.EOS_ID) |
650 | | - yield {"inputs": inputs, "targets": sample["targets"]} |
651 | | - |
652 | | - def feature_encoders(self, data_dir): |
653 | | - encoder = self.get_or_create_vocab(data_dir, None, force_get=True) |
654 | | - |
655 | | - return { |
656 | | - "inputs": encoder, |
657 | | - "targets": text_encoder.RealEncoder(), |
658 | | - } |
659 | | - |
660 | | - def hparams(self, defaults, unused_model_hparams): |
661 | | - p = defaults |
662 | | - p.modality = { |
663 | | - "inputs": modalities.ModalityType.SYMBOL, |
664 | | - "targets": modalities.ModalityType.REAL_L2_LOSS, |
665 | | - } |
666 | | - p.vocab_size = { |
667 | | - "inputs": self._encoders["inputs"].vocab_size, |
668 | | - "targets": self.ntasks |
669 | | - } |
670 | | - p.target_space_id = problem.SpaceID.REAL |
671 | | - p.add_hparam("regression_targets", True) |
672 | | - |
673 | | - def max_length(self, model_hparams): |
674 | | - return model_hparams.batch_size * self.ntasks |
675 | | - |
676 | | - def preprocess_example(self, example, unused_mode, unused_hparams): |
677 | | - example = problem.preprocess_example_common(example, unused_mode, |
678 | | - unused_hparams) |
679 | | - example["targets"] = tf.reshape(example["targets"], [1, 1, self.ntasks]) |
680 | | - return example |
681 | | - |
682 | | - def example_reading_spec(self): |
683 | | - data_fields = { |
684 | | - "inputs": tf.VarLenFeature(tf.int64), |
685 | | - "targets": tf.FixedLenFeature([self.ntasks], tf.float32), |
686 | | - } |
687 | | - data_items_to_decoders = None |
688 | | - return (data_fields, data_items_to_decoders) |
689 | | - |
690 | | - def eval_metrics(self): |
691 | | - metrics_list = [metrics.Metrics.RMSE] |
692 | | - if self.ntasks == 1: |
693 | | - metrics_list.append(metrics.Metrics.PEARSON) |
694 | | - return metrics_list |
695 | | - |
696 | | - |
697 | 608 | def txt_line_iterator(txt_path): |
698 | 609 | """Iterate through lines of file.""" |
699 | 610 | with tf.gfile.Open(txt_path) as f: |
@@ -781,21 +692,6 @@ def text2class_txt_iterator(source_txt_path, label_txt_path, class_strs=None): |
781 | 692 | yield {"inputs": inputs, "label": label} |
782 | 693 |
|
783 | 694 |
|
784 | | -def text2real_txt_iterator(source_txt_path, target_txt_path): |
785 | | - """Yield dicts for Text2RealProblem.generate_samples from lines of files. |
786 | | - Args: |
787 | | - source_txt_path: txt file with record per line. |
788 | | - target_txt_path: txt file with float (or space-separated float list for |
789 | | - multitask) per line. |
790 | | - Yields: |
791 | | - {"inputs": inputs, "targets": targets} |
792 | | - """ |
793 | | - for inputs, targets in zip( |
794 | | - txt_line_iterator(source_txt_path), txt_line_iterator(target_txt_path)): |
795 | | - targets = [float(x) for x in targets.split(" ")] |
796 | | - yield {"inputs": inputs, "targets": targets} |
797 | | - |
798 | | - |
799 | 695 | def text2text_txt_tab_iterator(txt_path): |
800 | 696 | """Yield dicts for Text2TextProblem.generate_samples from lines of txt_path. |
801 | 697 |
|
|
0 commit comments