bigscience-workshop
diff --git a/‎tokenizer/python_script/dedup_exact_article.py‎
Lines changed: 118 additions & 0 deletions b/‎tokenizer/python_script/dedup_exact_article.py‎
Lines changed: 118 additions & 0 deletions
diff --git a/‎tokenizer/python_script/dedup_lines.py‎
Lines changed: 280 additions & 0 deletions b/‎tokenizer/python_script/dedup_lines.py‎
Lines changed: 280 additions & 0 deletions
@@ -0,0 +1,118 @@
+"""Taken from Teven and Leandro"""
+import argparse
+import gzip
+import hashlib
+import logging
+import os
+import shutil
+import time
+
+from datasets import load_from_disk
+from datasets.utils.logging import set_verbosity_info
+
+
+set_verbosity_info()  # raise the `datasets` library log verbosity to INFO
+logger = logging.getLogger(__name__)  # module-level logger used by main()
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Load seed and upload to hub")
+    parser.add_argument(
+        "--save-dir", required=True, type=str, help="Where to save the datasets."
+    )
+    parser.add_argument(
+        "--dataset_dir",
+        help="path to where the arrow dataset is located",
+        required=True,
+        type=str,
+    )
+    parser.add_argument(
+        "--batch-size",
+        help="Batch size used for the mapping and saving of the dataset",
+        required=True,
+        type=int,
+    )
+    parser.add_argument(
+        "--num-proc",
+        help="Number of processors used for the mapping and saving of the dataset",
+        required=True,
+        type=int,
+    )
+    args = parser.parse_args()
+    return args
+
+
+def get_hash(example):
+    """Get hash of content field."""
+    return {"hash": hash(example["text"].replace(" ", ""))}
+
+
+def check_uniques(example, uniques):
+    """Check if current hash is still in set of unique hashes and remove if true."""
+    if example["hash"] in uniques:
+        uniques.remove(example["hash"])
+        return True
+    else:
+        return False
+
+
+def preprocess(example):
+    """Chain all preprocessing steps into one function to not fill cache."""
+    results = dict()
+    results.update(get_hash(example))
+    return results
+
+
+def filter(example, uniques, args):
+    """Filter dataset with heuristics."""
+    if not check_uniques(example, uniques):
+        return False
+    else:
+        return True
+
+
+def compress_file(file_path):
+    """Compress a file with g-zip."""
+    with open(file_path, "rb") as f_in:
+        with gzip.open(file_path + ".gz", "wb", compresslevel=6) as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    os.unlink(file_path)
+
+
+def main():
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    args = get_args()
+
+    # Load dataset
+    t_start = time.time()
+    ds = load_from_disk(args.dataset_dir)
+    logger.info(f"Time to load dataset: {time.time()-t_start:.2f}")
+
+    # Run preprocessing
+    t_start = time.time()
+    ds = ds.map(preprocess, num_proc=args.num_proc)
+    logger.info(f"Time to preprocess dataset: {time.time()-t_start:.2f}")
+
+    # Deduplicate hashes
+    uniques = set(ds.unique("hash"))
+    frac = len(uniques) / len(ds)
+    logger.info(f"Fraction of duplicates: {1-frac:.2%}")
+
+    # Deduplicate data and apply heuristics
+    t_start = time.time()
+    ds_filter = ds.filter(filter, fn_kwargs={"uniques": uniques, "args": args})
+    logger.info(f"Time to filter dataset: {time.time()-t_start:.2f}")
+    logger.info(f"Size of filtered dataset: {len(ds_filter)}")
+
+    # Save data
+    t_start = time.time()
+    ds_filter.save_to_disk(args.save_dir)
+
+    logger.info(f"Time to save dataset: {time.time()-t_start:.2f}")
+
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script
+    main()
@@ -0,0 +1,280 @@
+import json
+import shutil
+from collections import defaultdict
+import os
+import argparse
+import logging
+
+import datasets
+from functools import partial
+import pandas as pd
+from datasets import Features, load_dataset, load_from_disk
+from tqdm import tqdm
+from datasets.utils.logging import set_verbosity_info
+from numpy.random import SeedSequence, default_rng
+
+"""
+Cleaning text:
+ - run exact deduplication
+"""
+
+set_verbosity_info()  # raise the `datasets` library log verbosity to INFO
+logger = logging.getLogger(__name__)  # module-level logger used below
+
+###
+# seed processing and upload functions
+###
+
+
+META_COLUMNS = ["meta"]  # metadata column(s) main() keeps next to the text column
+
+# filter text to remove certain lines (e.g. menu items, copyright notice)
+def filter_lines(article, skip_set, used_lines):
+    # TODO discuss the strip
+    lines = [line.strip() for line in article.split("\n")]
+    keep = []
+    skip = []
+    for line in lines:
+        if line in skip_set and line in used_lines:
+            skip += [line]
+        elif line in skip_set:
+            keep += [line]
+            used_lines.add(line)
+        else:
+            keep += [line]
+    return "\n".join(keep).strip(), "\n".join(skip).strip()
+
+
+def filter_lines_by_batch(texts, skip_set, used_lines, preserve_code, metadata=None):
+    if preserve_code:
+        filtered_lines = [
+            filter_lines(article, skip_set, used_lines)
+            if "lm_code" in eval(metadata_item)["source_dataset"]
+            else (article, "")
+            for article, metadata_item in zip(texts, metadata)
+        ]
+    else:
+        filtered_lines = [
+            filter_lines(article, skip_set, used_lines) for article in texts
+        ]
+    return tuple(zip(*filtered_lines))
+
+
+# do both together and return an entry
+def process_batch(batch, skip_set, used_lines, args):
+    if not args.with_meta_col:
+        texts, _ = filter_lines_by_batch(
+            batch["text"], skip_set, used_lines, preserve_code=False
+        )
+        return {
+            "text": texts,
+        }
+    else:
+        texts, _ = filter_lines_by_batch(
+            batch["text"],
+            skip_set,
+            used_lines,
+            preserve_code=args.preserve_code,
+            metadata=batch["meta"],
+        )
+        return {
+            "meta": batch["meta"],
+            "text": texts,
+        }
+
+
+# looks at up to the first 10K pages for a seed and
+# records lines that appear in at least 1% of the unique pages
+def get_lines_to_skip(dset, n_records, pourcentage_threshold, min_repetition_threshold):
+    line_counts = defaultdict(lambda: 0)
+    seen_pages = defaultdict(lambda: 0)
+
+    seed = SeedSequence(42)
+    rng = default_rng(seed)
+    num_elements = min(len(dset), n_records)
+    indices = rng.choice(len(dset), size=num_elements, replace=False, shuffle=False)
+
+    dset_sample = dset.select(indices)
+    for page in tqdm(dset_sample):
+        article = page["text"]
+
+        seen_pages[article] += 1
+        # We count the number of times we see identical lines in different documents.
+        all_lines = {line.strip() for line in article.split("\n")}
+        for line in all_lines:
+            line_counts[line] += 1
+
+    # TODO understand this logic, why it's not len(line_counts)
+    if pourcentage_threshold is not None:
+        thres_skip = max(
+            min_repetition_threshold, len(seen_pages) * pourcentage_threshold
+        )
+    else:
+        thres_skip = min_repetition_threshold
+    skip_set = {line for line, ct in line_counts.items() if ct > thres_skip}
+    return skip_set, seen_pages
+
+
+def clean_examples(examples, skip_lines_set, used_lines, args):
+    if args.with_meta_col:
+        results = {"text": [], "meta": []}
+    else:
+        results = {"text": []}
+    # Collapses meta and cleans text
+    preprocessed_batch = process_batch(examples, skip_lines_set, used_lines, args)
+    assert set(results.keys()) == set(preprocessed_batch.keys())
+
+    for idx, cleaned_article in enumerate(preprocessed_batch["text"]):
+        if len(cleaned_article) <= args.min_chars:
+            continue
+        for key in results.keys():
+            results[key].append(preprocessed_batch[key][idx])
+
+    return results
+
+
+# NOTE(review): filter_and_save below saves to a local directory with save_to_disk; no repository push or jsonl export is visible in this file
+TEXT_COLUMN = "text"  # name of the text column main() keeps
+
+
+def filter_and_save(dset, skip_lines_set, seen_pages, args):
+    repo_name = args.save_dir
+    # TODO build a caching mechanism
+    repo_name_tmp = f"{repo_name}.tmp"
+    if not os.path.isdir(repo_name_tmp):
+        os.makedirs(repo_name_tmp)
+
+    # process
+    used_lines = set()
+    dset = dset.map(
+        partial(
+            clean_examples,
+            skip_lines_set=skip_lines_set,
+            used_lines=used_lines,
+            args=args,
+        ),
+        batched=True,
+        # num_proc=args.num_proc, # single proccess for used_lines
+        batch_size=args.batch_size,
+        remove_columns=dset.column_names,
+    )
+    logger.info(f"Finished cleaning")
+
+    # write to folder
+    dset.save_to_disk(repo_name_tmp)
+
+    logger.info(f"Ended successfully, saved at {repo_name_tmp}")
+
+    # Saving skipped lines that are considered repetitive
+    with open(os.path.join(repo_name_tmp, "skipped_lines.json"), "w") as fi:
+        json.dump(list(skip_lines_set), fi, indent=2)
+
+    # Saving num of duplicated documents
+    with open(os.path.join(repo_name_tmp, "duplicate_documents.json"), "w") as fi:
+        json.dump([num for num in list(seen_pages.values()) if num > 1], fi, indent=2)
+
+    # Move so that the state becomes completed
+    shutil.move(repo_name_tmp, repo_name)
+
+
+def text_is_not_none(batch):
+    return [text is not None for text in batch["text"]]
+
+
+###
+# combine everything
+###
+def main():
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save-dir", required=True, type=str, help="Where to save the datasets."
+    )
+    parser.add_argument(
+        "--dataset_dir",
+        help="path to where the arrow dataset is located",
+        required=True,
+        type=str,
+    )
+    parser.add_argument(
+        "--batch-size",
+        help="Batch size used for mapping the dataset",
+        required=True,
+        type=int,
+    )
+    parser.add_argument(
+        "--num-proc",
+        help="Number of processors used for the mapping of the dataset",
+        required=True,
+        type=int,
+    )
+    parser.add_argument(
+        "--min-chars",
+        help="Minimum number of chars in a line",
+        required=True,
+        type=int,
+    )
+    parser.add_argument(
+        "--n-records",
+        help="Number of records used to compute the repetitions",
+        required=True,
+        type=int,
+    )
+    parser.add_argument(
+        "--pourcentage-threshold",
+        help="Threshold used for filter repetitions",
+        default=None,
+        type=float,
+    )
+    parser.add_argument(
+        "--min-repetition-threshold",
+        help="Minimum threshold used for filter repetitions. Used when the number of available records is not enough",
+        required=True,
+        type=int,
+    )
+    parser.add_argument(
+        "--with-meta-col",
+        help="If the initial dataset has a meta column",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--preserve_code",
+        help="Exclude code datasets from the line dedup",
+        action="store_true",
+    )
+    args = parser.parse_args()
+    # Load dataset (data first needs to be git pulled, see above)
+
+    dset = load_from_disk(args.dataset_dir)
+
+    # pre-remove unecessary columns, hopefully that saves qui a bit of memory usage
+    columns_to_keep = [TEXT_COLUMN] + META_COLUMNS
+    dset = dset.remove_columns(list(set(dset.column_names) - set(columns_to_keep)))
+
+    # Filter None text columns
+    number_of_samples_before = len(dset)
+    dset = dset.filter(text_is_not_none, batched=True, num_proc=args.num_proc)
+    number_of_samples_after_filtering_none = len(dset)
+    logger.info(
+        f"Filtered out {number_of_samples_before - number_of_samples_after_filtering_none} / {number_of_samples_before}"
+    )
+
+    skip_lines_set, seen_pages = get_lines_to_skip(
+        dset,
+        n_records=args.n_records,
+        pourcentage_threshold=args.pourcentage_threshold,
+        min_repetition_threshold=args.min_repetition_threshold,
+    )
+
+    filter_and_save(
+        dset, skip_lines_set=skip_lines_set, seen_pages=seen_pages, args=args
+    )
+    logger.info("Finished")
+
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script
+    main()