diff --git a/.amltignore b/.amltignore index 0d1eceb..2ae20ee 100644 --- a/.amltignore +++ b/.amltignore @@ -1,7 +1,7 @@ -__pycache__/ -data/ -models/ -temp/ -wandb/ -.vscode/ -.amltconfig +__pycache__/ +data/ +models/ +temp/ +wandb/ +.vscode/ +.amltconfig diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 3c0abac..0124c76 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,101 +1,101 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. -# -name: "CodeQL Advanced" - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - schedule: - - cron: '28 21 * * 3' - -jobs: - analyze: - name: Analyze (${{ matrix.language }}) - # Runner size impacts CodeQL analysis time. To learn more, please see: - # - https://gh.io/recommended-hardware-resources-for-running-codeql - # - https://gh.io/supported-runners-and-hardware-resources - # - https://gh.io/using-larger-runners (GitHub.com only) - # Consider using larger runners or machines with greater resources for possible analysis time improvements. 
- runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} - permissions: - # required for all workflows - security-events: write - - # required to fetch internal or private CodeQL packs - packages: read - - # only required for workflows in private repositories - actions: read - contents: read - - strategy: - fail-fast: false - matrix: - include: - - language: actions - build-mode: none - - language: python - build-mode: none - # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'rust', 'swift' - # Use `c-cpp` to analyze code written in C, C++ or both - # Use 'java-kotlin' to analyze code written in Java, Kotlin or both - # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both - # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, - # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. - # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how - # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - # Add any setup steps before running the `github/codeql-action/init` action. - # This includes steps like installing compilers or runtimes (`actions/setup-node` - # or others). This is typically only required for manual builds. - # - name: Setup runtime (example) - # uses: actions/setup-example@v1 - - # Initializes the CodeQL tools for scanning. 
- - name: Initialize CodeQL - uses: github/codeql-action/init@v4 - with: - languages: ${{ matrix.language }} - build-mode: ${{ matrix.build-mode }} - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. - - # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs - # queries: security-extended,security-and-quality - - # If the analyze step fails for one of the languages you are analyzing with - # "We were unable to automatically build your code", modify the matrix above - # to set the build mode to "manual" for that language. Then modify this step - # to build your code. - # ℹ️ Command-line programs to run using the OS shell. - # πŸ“š See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - - name: Run manual build steps - if: matrix.build-mode == 'manual' - shell: bash - run: | - echo 'If you are using a "manual" build mode for one or more of the' \ - 'languages you are analyzing, replace this with the commands to build' \ - 'your code, for example:' - echo ' make bootstrap' - echo ' make release' - exit 1 - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v4 - with: - category: "/language:${{matrix.language}}" +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. 
Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL Advanced" + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + schedule: + - cron: '28 21 * * 3' + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: actions + build-mode: none + - language: python + build-mode: none + # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'rust', 'swift' + # Use `c-cpp` to analyze code written in C, C++ or both + # Use 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, + # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 
+ # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how + # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Add any setup steps before running the `github/codeql-action/init` action. + # This includes steps like installing compilers or runtimes (`actions/setup-node` + # or others). This is typically only required for manual builds. + # - name: Setup runtime (example) + # uses: actions/setup-example@v1 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v4 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + - name: Run manual build steps + if: matrix.build-mode == 'manual' + shell: bash + run: | + echo 'If you are using a "manual" build mode for one or more of the' \ + 'languages you are analyzing, replace this with the commands to build' \ + 'your code, for example:' + echo ' make bootstrap' + echo ' make release' + exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v4 + with: + category: "/language:${{matrix.language}}" diff --git a/.gitignore b/.gitignore index 0d1eceb..2ae20ee 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ -__pycache__/ -data/ -models/ -temp/ -wandb/ -.vscode/ -.amltconfig +__pycache__/ +data/ +models/ +temp/ +wandb/ +.vscode/ +.amltconfig diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f9ba8cf..c72a574 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,9 +1,9 @@ -# Microsoft Open Source Code of Conduct - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). - -Resources: - -- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) -- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) -- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
+ +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/LICENSE b/LICENSE index 9e841e7..3d8b93b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ - MIT License - - Copyright (c) Microsoft Corporation. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE + MIT License + + Copyright (c) Microsoft Corporation. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/README.md b/README.md index f119403..22c7a5b 100644 --- a/README.md +++ b/README.md @@ -1,69 +1,55 @@ -# Data Efficacy for Language Model Training +# Data Efficacy

- Task - Paper + Task License

-

- [📜 Paper] • - [🐱 GitHub Code] • - [🤗 HF Model] - 

+Large-scale model training benefits from data at scale, but the value of a dataset also depends on how effectively it is used. **Data Efficacy** studies how to turn available data into stronger training signal by scoring samples, selecting useful subsets, and organizing them into effective training sequences. -
- Figure 1 -
-
Figure 1. Average result across 8 benchmarks for different methods. High performance at the same selection ratio indicates high efficacy, while achieving similar performance with a smaller selection ratio demonstrates high efficiency. Our method excels in both efficacy and efficiency.
-
-
+## Introduction -## 🌟 Introduction -Data is fundamental to the training of language models (LM). Recent research has been dedicated to data efficiency, which aims to maximize performance by selecting a minimal or optimal subset of training data. Techniques such as data filtering, sampling, and selection play a crucial role in this area. To complement it, we define Data Efficacy, which focuses on maximizing performance by optimizing the organization of training data and remains relatively underexplored. This work introduces a general paradigm, DELT, for considering data efficacy in LM training, which highlights the significance of training data organization. DELT comprises three components: Data Scoring, Data Selection, and Data Ordering. +Large-scale model training depends heavily on data curation. Many data efficiency methods compute expensive sample-level scores for quality, difficulty, learnability, or relevance, but these scores are often used only once for filtering. -
- Figure 2 -
-
Figure 2. DELT paradigm.
-
-
+Data Efficacy aims to reuse such scores more fully across the training pipeline. In this repository, that shared pipeline is organized around four reusable stages: -
+- **Data Scoring** estimates sample-level utility. +- **Data Selection** chooses useful subsets under a data or compute budget. +- **Data Ordering** organizes selected samples into an effective training sequence. +- **Model Training and Evaluation** measure whether the curated data improves downstream performance. -For data scoring, we design **Learnability-Quality Scoring (LQS)** method, which considers both the learnability and quality of each data sample from the gradient consistency perspective. +![Data efficacy pipeline](./figures/data_efficacy_paradigm.png) -
- Figure 3 -
-
Figure 3. Learnability-Quality Scoring (LQS).
-
-
+## News -
+- **2026/05**: Added the ACL 2026 follow-up work **Demystifying Data Organization for Enhanced LLM Training**, with new data organization methods under `data_ordering`. +- **2025/08**: Released the codebase for general-domain pre-training. +- **2025/06**: Released **Data Efficacy for Language Model Training (DELT)** on arXiv. -For data ordering, we devise **Folding Ordering (FO)** method, which addresses issues such as model forgetting and data distribution bias. +## Works -
- Figure 4 -
-
Figure 4. Folding Ordering (FO).
-
-
+### Demystifying Data Organization for Enhanced LLM Training ([Paper](https://openreview.net/forum?id=i409rQuIfB) | [README](./docs/demystifying_data_organization_for_enhanced_llm_training.md)) +This work studies how to organize scored training data (data ordering) and introduces practical guidance for boundary sharpening, cyclic scheduling, curriculum continuity, and local diversity. -## 📢 News and Updates +### Data Efficacy for Language Model Training ([Paper](https://arxiv.org/abs/2506.21545) | [README](./docs/data_efficacy_for_language_model_training.md)) -Done -- [x] 2025/06/28: 💥The [Arxiv paper](https://arxiv.org/abs/2506.21545) released. -- [x] 2025/08/31: 💥The DELT code released for pre-training on general domain. +This work introduces a data efficacy pipeline for language model training that reuses sample-level scores across data scoring, data selection, and data ordering. -TBD -- [ ] Release the model of LQS data scorer on general domain (CommonCrawl). -- [ ] Release the DELT code for post-training on specific domain. +## Repo Structure +```text +. +├── data_scoring/ # Compute sample-level scores, including LQS and KenLM-based scoring. +├── data_selection/ # Select subsets with top-r, top-k, or threshold methods. +├── data_ordering/ # Organize scored data with sorting, folding, zig-zag, segment, STR, and SAW. +├── model_train/ # Train models on curated data. +├── model_eval/ # Evaluate trained models. +├── docs/ # Paper-specific documentation and assets. +└── figures/ # Figures used by repository documentation. +``` -## ⚙️ Environment Installation +## Installation ```bash conda create -n data_efficacy python=3.10 -y @@ -71,7 +57,9 @@ conda activate data_efficacy pip install -r requirements.txt ``` -## 💾 Preparation. +For lightweight data ordering only, `numpy` and `pyyaml` are sufficient. + +## Preparation
Environment Variables @@ -88,97 +76,137 @@ export WANDB_API_KEY="" ```bash python utils.py --content dataset --id $HF_DATASET_ID --save-dir $OUTPUT_DATA_PATH -# e.g. python utils.py --content=dataset --id=togethercomputer/RedPajama-Data-1T --save-dir=data/source-cc-1b.jsonl --data-name=common_crawl --split-name=train --sample-size=500000 -# If you want to try the dataset used in the paper, please use the below commandline: -# python utils.py --content=dataset --id=togethercomputer/RedPajama-Data-1T-Sample --save-dir=data/source-cc-1b.jsonl -# You could also replace it with your own dataset under jsonl format. +# Example: +python utils.py \ + --content dataset \ + --id togethercomputer/RedPajama-Data-1T \ + --save-dir data/source-cc-1b.jsonl \ + --data-name common_crawl \ + --split-name train \ + --sample-size 500000 ``` + +You can also use your own JSONL dataset.
Model ```bash -python utils.py --content=model --id $HF_MODEL_ID --save-dir $OUTPUT_MODEL_PATH +python utils.py --content model --id $HF_MODEL_ID --save-dir $OUTPUT_MODEL_PATH -# e.g. python utils.py --content=model --id=Data-Selection/BSL-160M --save-dir=models/mistral-160m -# You could also replace it with your own model under hf format. +# Example: +python utils.py \ + --content model \ + --id Data-Selection/BSL-160M \ + --save-dir models/mistral-160m ```
-## ⏩ Quick Start. +## Pipeline Usage + +The repository exposes each stage through a separate entry script. You can run the full scoring-selection-ordering-training pipeline or reuse only the stages needed by a specific paper.
Data Scoring -Existing scoring method: **Learnability-Quality Score** (`lqs`), and Perplexity (`kenlm`). -For more details about LQS, please refer to [this guideline](./data_scoring/lqs/README.md). +Existing scoring methods include **Learnability-Quality Score** (`lqs`) and Perplexity (`kenlm`). For LQS details, see [data_scoring/lqs/README.md](./data_scoring/lqs/README.md). ```bash bash data_scoring/entry.sh $INPUT_DATA_PATH $OUTPUT_DATA_PATH $METHOD $CONFIG_PATH -# e.g. bash data_scoring/entry.sh data/source-cc-1b.jsonl data/source-cc-1b_scored-lqs.jsonl lqs data_scoring/config/lqs.yaml -# Please note that LQS involves downloading Hugging Face gated models/datasets, and you need to configure it. +# Example: +bash data_scoring/entry.sh \ + data/source-cc-1b.jsonl \ + data/source-cc-1b_scored-lqs.jsonl \ + lqs \ + data_scoring/config/lqs.yaml ```
Data Selection -Existing selection method: **Top-R** (`top-r`), Threshold (`threshold`), and Top-K (`top-k`). +Existing selection methods include **Top-R** (`top-r`), Threshold (`threshold`), and Top-K (`top-k`). ```bash bash data_selection/entry.sh $INPUT_DATA_PATH $OUTPUT_DATA_PATH $METHOD $CONFIG_PATH -# e.g. bash data_selection/entry.sh data/source-cc-1b_scored-lqs.jsonl data/source-cc-1b_scored-lqs_selected-r1.0.jsonl top-r data_selection/config/top-r.yaml +# Example: +bash data_selection/entry.sh \ + data/source-cc-1b_scored-lqs.jsonl \ + data/source-cc-1b_scored-lqs_selected-r1.0.jsonl \ + top-r \ + data_selection/config/top-r.yaml ```
Data Ordering -Existing ordering method: **Folding Ordering (FO)** (`folding`), Shuffle (`shuffle`), and Sorting (`sorting`). +Existing ordering methods include Sorting (`sorting`), Folding Ordering (`folding`), Zig-zag Ordering (`zigzag`), Segment Ordering (`segment`), Stair Ordering / STR (`stair`), Saw Ordering / SAW (`saw`), and Shuffle (`shuffle`). For the ACL 2026 data organization work, see [Demystifying Data Organization for Enhanced LLM Training](./docs/demystifying_data_organization_for_enhanced_llm_training.md). ```bash bash data_ordering/entry.sh $INPUT_DATA_PATH $OUTPUT_DATA_PATH $METHOD $CONFIG_PATH -# e.g. bash data_ordering/entry.sh data/source-cc-1b_scored-lqs_selected-r1.0.jsonl data/source-cc-1b_scored-lqs_selected-r1.0_ordered-folding-l3.jsonl folding data_ordering/config/folding.yaml +# Example: +bash data_ordering/entry.sh \ + data/source-cc-1b_scored-lqs_selected-r1.0.jsonl \ + data/source-cc-1b_scored-lqs_selected-r1.0_ordered-saw.jsonl \ + saw \ + data_ordering/config/saw.yaml ```
-
Model Training ```bash bash model_train/entry.sh $INPUT_DATA_PATH $INPUT_MODEL_PATH $OUTPUT_MODEL_PATH $METHOD $CONFIG_PATH -# e.g. bash model_train/entry.sh data/source-cc-1b_scored-lqs_selected-r1.0_ordered-folding-l3.jsonl models/mistral-160m models/pretrain_mistral-160m_source-cc-1b_scored-lqs_selected-r1.0_ordered-folding-l3_src pretrain model_train/config/train.yaml +# Example: +bash model_train/entry.sh \ + data/source-cc-1b_scored-lqs_selected-r1.0_ordered-saw.jsonl \ + models/mistral-160m \ + models/pretrain_mistral-160m_source-cc-1b_ordered-saw \ + pretrain \ + model_train/config/train.yaml ```
-
Model Evaluation ```bash bash model_eval/entry.sh $INPUT_MODEL_PATH $OUTPUT_RESULT_PATH $METHOD $CONFIG_PATH -# e.g. bash model_eval/entry.sh models/pretrain_mistral-160m_source-cc-1b_scored-lqs_selected-r1.0_ordered-folding-l3_src models/pretrain_mistral-160m_source-cc-1b_scored-lqs_selected-r1.0_ordered-folding-l3_src/result.yaml lm_evaluation_harness model_eval/config/general.yaml +# Example: +bash model_eval/entry.sh \ + models/pretrain_mistral-160m_source-cc-1b_ordered-saw \ + models/pretrain_mistral-160m_source-cc-1b_ordered-saw/result.yaml \ + lm_evaluation_harness \ + model_eval/config/general.yaml ```
+## Citation -## πŸ”— Citation -``` +```bibtex @article{dai2025data, title={Data Efficacy for Language Model Training}, author={Yalun Dai and Yangyu Huang and Xin Zhang and Wenshan Wu and Chong Li and Wenhui Lu and Shijie Cao and Li Dong and Scarlett Li}, journal={arXiv preprint arXiv:2506.21545}, year={2025} } + +@inproceedings{dai2026demystifying, + title={Demystifying Data Organization for Enhanced LLM Training}, + author={Yalun Dai and Yangyu Huang and Tongshen Yang and Yonghan Wang and Xin Zhang and Wenshan Wu and Qihao Zhao and Hao Li and Yuanyuan Gao and Kim-Hui Yap and Scarlett Li}, + booktitle={Proceedings of the Annual Meeting of the Association for Computational Linguistics}, + year={2026} +} ``` -## πŸ‘€ License -This repository is licensed under the [MIT](https://github.com/microsoft/DELT/blob/main/LICENSE) License. +## License +This repository is licensed under the [MIT](./LICENSE) License. diff --git a/SECURITY.md b/SECURITY.md index b3c89ef..6b906d4 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,41 +1,41 @@ - - -## Security - -Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). - -If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. - -## Reporting Security Issues - -**Please do not report security vulnerabilities through public GitHub issues.** - -Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 
- -If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). - -You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). - -Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: - - * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) - * Full paths of source file(s) related to the manifestation of the issue - * The location of the affected source code (tag/branch/commit or direct URL) - * Any special configuration required to reproduce the issue - * Step-by-step instructions to reproduce the issue - * Proof-of-concept or exploit code (if possible) - * Impact of the issue, including how an attacker might exploit the issue - -This information will help us triage your report more quickly. - -If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. - -## Preferred Languages - -We prefer all communications to be in English. - -## Policy - -Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 
- - + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
+ * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). + + diff --git a/SUPPORT.md b/SUPPORT.md deleted file mode 100644 index 291d4d4..0000000 --- a/SUPPORT.md +++ /dev/null @@ -1,25 +0,0 @@ -# TODO: The maintainer of this repo has not yet edited this file - -**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? - -- **No CSS support:** Fill out this template with information about how to file issues and get help. -- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. -- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. - -*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* - -# Support - -## How to file issues and get help - -This project uses GitHub Issues to track bugs and feature requests. Please search the existing -issues before filing new issues to avoid duplicates. For new issues, file your bug or -feature request as a new Issue. 
- -For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE -FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER -CHANNEL. WHERE WILL YOU HELP PEOPLE?**. - -## Microsoft Support Policy - -Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/data_ordering/README.md b/data_ordering/README.md new file mode 100644 index 0000000..cd9fa0b --- /dev/null +++ b/data_ordering/README.md @@ -0,0 +1,28 @@ +# Data Ordering + +This module contains score-based data ordering methods for the Data Efficacy pipeline. + +Implemented methods: + +- `shuffle`: random ordering baseline. +- `sorting`: score sorting / curriculum learning baseline. +- `folding`: Folding Ordering (FO). +- `zigzag`: Zig-zag Ordering (ZIG). +- `segment`: Segment Ordering (SEG). +- `stair`: Stair Ordering (STR). +- `saw`: Saw Ordering (SAW). + +Run: + +```bash +bash data_ordering/entry.sh \ + data/scored_data.jsonl \ + data/ordered_saw.jsonl \ + saw \ + data_ordering/config/saw.yaml +``` + +For paper-specific details, see: + +- [Data Efficacy for Language Model Training](../docs/data_efficacy_for_language_model_training.md) +- [Demystifying Data Organization for Enhanced LLM Training](../docs/demystifying_data_organization_for_enhanced_llm_training.md) diff --git a/data_ordering/__init__.py b/data_ordering/__init__.py new file mode 100644 index 0000000..e3aba36 --- /dev/null +++ b/data_ordering/__init__.py @@ -0,0 +1,2 @@ +"""Data ordering strategies for the ACL 2026 data organization work.""" + diff --git a/data_ordering/common.py b/data_ordering/common.py new file mode 100644 index 0000000..35fa305 --- /dev/null +++ b/data_ordering/common.py @@ -0,0 +1,190 @@ +import ast +import json +import os +from typing import Any, Iterable, List, Mapping, Sequence + +import numpy as np + +try: + import yaml +except ImportError: # pragma: no cover - exercised only in minimal envs. 
+ yaml = None + + +def _parse_scalar(value: str) -> Any: + value = value.strip() + if value == "": + return "" + + lowered = value.lower() + if lowered in {"true", "false"}: + return lowered == "true" + if lowered in {"none", "null"}: + return None + + try: + return ast.literal_eval(value) + except (SyntaxError, ValueError): + return value.strip("\"'") + + +def load_yaml(file_path: str) -> Mapping[str, Any]: + """Load the flat YAML configs used by the ordering scripts. + + PyYAML is preferred when available. The small fallback keeps + `data_ordering` runnable in a fresh Python environment because the module + only needs simple `key: value` config files. + """ + with open(file_path, "r", encoding="utf-8") as file: + if yaml is not None: + data = yaml.safe_load(file) + return data or {} + + data = {} + for raw_line in file: + line = raw_line.split("#", 1)[0].strip() + if not line or ":" not in line: + continue + key, value = line.split(":", 1) + data[key.strip()] = _parse_scalar(value) + return data + + +def load_jsonl(file_path: str) -> List[dict]: + with open(file_path, "r", encoding="utf-8") as file: + return [json.loads(line) for line in file if line.strip()] + + +def write_jsonl(file_path: str, data: Iterable[Mapping[str, Any]]) -> None: + parent = os.path.dirname(file_path) + if parent: + os.makedirs(parent, exist_ok=True) + + with open(file_path, "w", encoding="utf-8") as file: + for entry in data: + file.write(json.dumps(entry, ensure_ascii=False) + "\n") + + +def add_config_args(args: Any, method_params: Mapping[str, Any]) -> Any: + for field, value in method_params.items(): + if not hasattr(args, field) or getattr(args, field) is None: + setattr(args, field, value) + return args + + +def get_score(item: Mapping[str, Any], score_field: str) -> float: + if score_field not in item: + raise KeyError(f"Missing score field '{score_field}' in item: {item}") + return float(item[score_field]) + + +def sorted_indices(data: Sequence[Mapping[str, Any]], score_field: 
str, ascending: bool = True) -> List[int]: + return sorted( + range(len(data)), + key=lambda idx: get_score(data[idx], score_field), + reverse=not ascending, + ) + + +def sort_data(data: Sequence[Mapping[str, Any]], score_field: str, ascending: bool = True) -> List[dict]: + return [data[idx] for idx in sorted_indices(data, score_field, ascending)] + + +def gumbel_rank_jitter(indices: Sequence[int], tau: float = 1.0, use_gumbel: bool = False, seed: int = 42) -> List[int]: + if not use_gumbel: + return list(indices) + + rng = np.random.default_rng(seed) + noise = rng.gumbel(size=len(indices)) * tau + perturbed_positions = np.arange(len(indices), dtype=float) + noise + order = np.argsort(perturbed_positions, kind="stable") + idx = np.asarray(indices) + return list(idx[order]) + + +def window_based_shuffle(data: Sequence[dict], window_size: int = 0, seed: int = 42) -> List[dict]: + if window_size is None or window_size <= 1: + return list(data) + + rng = np.random.default_rng(seed) + shuffled_data: List[dict] = [] + for start in range(0, len(data), window_size): + chunk = list(data[start : start + window_size]) + rng.shuffle(chunk) + shuffled_data.extend(chunk) + return shuffled_data + + +def validate_layers(layers: int, name: str) -> int: + layers = int(layers) + if layers < 1: + raise ValueError(f"{name} must be >= 1, got {layers}") + return layers + + +def folding_order(sorted_data: Sequence[dict], layers: int) -> List[dict]: + layers = validate_layers(layers, "folding_layer") + ordered: List[dict] = [] + for layer in range(layers): + ordered.extend(sorted_data[layer::layers]) + return ordered + + +def zigzag_order(sorted_data: Sequence[dict], layers: int) -> List[dict]: + layers = validate_layers(layers, "zigzag_layer") + ordered: List[dict] = [] + for layer in range(layers): + layer_data = list(sorted_data[layer::layers]) + if layer % 2 == 1: + layer_data.reverse() + ordered.extend(layer_data) + return ordered + + +def cross_guidance_order( + sorted_data: 
Sequence[dict], + num_sections: int, + transition_ratio: float, + folding_layer: int, + mode: str, +) -> List[dict]: + """Apply STR/SAW transition regions on top of globally sorted data.""" + num_sections = validate_layers(num_sections, "num_sections") + folding_layer = validate_layers(folding_layer, "folding_layer") + if transition_ratio < 0: + raise ValueError(f"folding_ratio must be >= 0, got {transition_ratio}") + if mode not in {"folding", "zigzag"}: + raise ValueError(f"Unsupported transition mode: {mode}") + + n_items = len(sorted_data) + if n_items == 0 or num_sections == 1 or transition_ratio == 0: + return list(sorted_data) + + split_points = [round(n_items * section / num_sections) for section in range(1, num_sections)] + radius = round(n_items * transition_ratio) + ordered: List[dict] = [] + cursor = 0 + + for split_point in split_points: + transition_start = max(cursor, split_point - radius) + transition_end = min(n_items, split_point + radius) + + if cursor < transition_start: + ordered.extend(sorted_data[cursor:transition_start]) + + transition = list(sorted_data[transition_start:transition_end]) + if mode == "folding": + ordered.extend(folding_order(transition, folding_layer)) + else: + ordered.extend(zigzag_order(transition, folding_layer)) + cursor = transition_end + + if cursor < n_items: + ordered.extend(sorted_data[cursor:n_items]) + + return ordered + + +def ensure_permutation(input_data: Sequence[dict], output_data: Sequence[dict]) -> None: + if len(input_data) != len(output_data): + raise ValueError(f"Ordering changed data size from {len(input_data)} to {len(output_data)}") diff --git a/data_ordering/config/folding.yaml b/data_ordering/config/folding.yaml index dd353d9..52f5f3c 100644 --- a/data_ordering/config/folding.yaml +++ b/data_ordering/config/folding.yaml @@ -2,5 +2,8 @@ name: folding version: 1.0 description: Config of folding method in data ordering. 
-score_field: score -folding_layer: 3 +score_field: average_test_score +folding_layer: 5 +window_size: 0 +seed: 42 +ascending: true diff --git a/data_ordering/config/saw.yaml b/data_ordering/config/saw.yaml new file mode 100644 index 0000000..27b86e3 --- /dev/null +++ b/data_ordering/config/saw.yaml @@ -0,0 +1,12 @@ +name: saw ordering +version: 1.0 +description: Config of section (hybrid sorting/folding) method in data ordering. + + +score_field: average_test_score +ascending: true +folding_layer: 2 +num_sections: 3 +folding_ratio: 0.075 +window_size: 0 +seed: 42 diff --git a/data_ordering/config/segment.yaml b/data_ordering/config/segment.yaml new file mode 100644 index 0000000..935847f --- /dev/null +++ b/data_ordering/config/segment.yaml @@ -0,0 +1,13 @@ +name: segment +version: 1.0 +description: Config of segment-based ordering method. + + +score_field: average_test_score + +x_pct: 10 +y_pct: 10 +front_is_high: false +back_is_high: true + +seed: 42 diff --git a/data_ordering/config/shuffle.yaml b/data_ordering/config/shuffle.yaml index e01b190..69b4109 100644 --- a/data_ordering/config/shuffle.yaml +++ b/data_ordering/config/shuffle.yaml @@ -2,5 +2,5 @@ name: shuffle version: 1.0 description: Config of shuffle method in data ordering. -score_field: score +score_field: average_test_score seed: 10 diff --git a/data_ordering/config/sorting.yaml b/data_ordering/config/sorting.yaml index 03d903f..1cce381 100644 --- a/data_ordering/config/sorting.yaml +++ b/data_ordering/config/sorting.yaml @@ -1,6 +1,10 @@ name: sorting -version: 1.0 -description: Config of sorting method in data ordering. +version: 1.1 +description: Config of sorting method with local window shuffling. 
-score_field: score +score_field: average_test_score ascending: true +use_gumbel: false +temperature: 0 +window_size: 0 +seed: 42 diff --git a/data_ordering/config/stair.yaml b/data_ordering/config/stair.yaml new file mode 100644 index 0000000..8fa916b --- /dev/null +++ b/data_ordering/config/stair.yaml @@ -0,0 +1,12 @@ +name: stair ordering +version: 1.0 +description: Config of section (hybrid sorting/folding) method in data ordering. + + +score_field: average_test_score +ascending: true +folding_layer: 3 +num_sections: 2 +folding_ratio: 0.225 +window_size: 0 +seed: 42 diff --git a/data_ordering/config/zigzag.yaml b/data_ordering/config/zigzag.yaml new file mode 100644 index 0000000..bf2ee35 --- /dev/null +++ b/data_ordering/config/zigzag.yaml @@ -0,0 +1,11 @@ +name: zigzag +version: 2.0 +description: Config of zigzag method in data ordering. + +score_field: average_test_score +zigzag_layer: 5 +use_gumbel: false +temperature: 0 +seed: 42 +window_size: 0 +ascending: true diff --git a/data_ordering/entry.py b/data_ordering/entry.py index 5db62a7..d377f97 100644 --- a/data_ordering/entry.py +++ b/data_ordering/entry.py @@ -1,42 +1,56 @@ -import os -import sys -sys.path.insert(0, os.getcwd()) import argparse -import shuffle -import sorting -import folding -from utils import load_yaml, load_jsonl, add_args, write_jsonl - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Data ordering.") - parser.add_argument("--input_data_path", type=str, help="Path to the input .jsonl file.") - parser.add_argument("--output_data_path", type=str, help="Path to the output .jsonl file.") - parser.add_argument("--method", type=str, choices=["shuffle", "sorting", "folding"], default="folding", - help="Ordering method: 'shuffle', 'sorting', and 'folding'. Defaults to 'folding'.") - parser.add_argument("--config_path", type=str, default="./config/folding.yaml", help="Config file for additional parameters (YAML format).") +try: + from . 
import folding, saw, segment, shuffle, sorting, stair, zigzag + from .common import add_config_args, ensure_permutation, load_jsonl, load_yaml, write_jsonl +except ImportError: + import folding + import saw + import segment + import shuffle + import sorting + import stair + import zigzag + from common import add_config_args, ensure_permutation, load_jsonl, load_yaml, write_jsonl + + +METHODS = { + "shuffle": shuffle.order, + "sorting": sorting.order, + "folding": folding.order, + "zigzag": zigzag.order, + "segment": segment.order, + "stair": stair.order, + "saw": saw.order, +} + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Order a scored JSONL dataset.") + parser.add_argument("--input_data_path", required=True, type=str, help="Path to the input .jsonl file.") + parser.add_argument("--output_data_path", required=True, type=str, help="Path to the output .jsonl file.") + parser.add_argument("--method", choices=sorted(METHODS), default="folding", help="Data ordering method.") + parser.add_argument("--config_path", default="data_ordering/config/folding.yaml", type=str, help="YAML config path.") + return parser + + +def main() -> None: + parser = build_parser() args = parser.parse_args() + args = add_config_args(args, load_yaml(args.config_path)) - args = add_args(args, load_yaml(args.config_path)) - - print(f" Arguments received:") - print(f" Input data path: {args.input_data_path}") - print(f" Selection method: {args.method}") - print(f" Score field: {args.score_field}") + print("Arguments received:") + print(f" input_data_path: {args.input_data_path}") + print(f" output_data_path: {args.output_data_path}") + print(f" method: {args.method}") + print(f" score_field: {args.score_field}") in_data = load_jsonl(args.input_data_path) - if args.method == "shuffle": - out_data = shuffle.order(in_data, args) - print(f" Random seed: {args.seed}") - - if args.method == "sorting": - out_data = sorting.order(in_data, args) - 
print(f" Ascending: {args.ascending}") - - if args.method == "folding": - out_data = folding.order(in_data, args) - print(f" Folding layer: {args.folding_layer}") + out_data = METHODS[args.method](in_data, args) + ensure_permutation(in_data, out_data) + write_jsonl(args.output_data_path, out_data) + print(f"Wrote {len(out_data)} records.") - write_jsonl(args.output_data_path, out_data) +if __name__ == "__main__": + main() diff --git a/data_ordering/entry.sh b/data_ordering/entry.sh index c8bcc85..5f149c4 100644 --- a/data_ordering/entry.sh +++ b/data_ordering/entry.sh @@ -1,12 +1,14 @@ #!/bin/bash +set -euo pipefail INPUT_DATA_PATH=${1-"./data/selected_data.jsonl"} OUTPUT_DATA_PATH=${2-"./data/ordered_data.jsonl"} -METHOD=${3-"folding"} +METHOD=${3-"folding"} CONFIG_PATH=${4-"./data_ordering/config/folding.yaml"} +PYTHON_BIN=${PYTHON:-python3} -python data_ordering/entry.py \ - --input_data_path $INPUT_DATA_PATH \ - --output_data_path $OUTPUT_DATA_PATH \ - --method $METHOD \ - --config_path $CONFIG_PATH \ +"$PYTHON_BIN" data_ordering/entry.py \ + --input_data_path "$INPUT_DATA_PATH" \ + --output_data_path "$OUTPUT_DATA_PATH" \ + --method "$METHOD" \ + --config_path "$CONFIG_PATH" \ diff --git a/data_ordering/folding.py b/data_ordering/folding.py index c28afed..1c77e12 100644 --- a/data_ordering/folding.py +++ b/data_ordering/folding.py @@ -1,12 +1,16 @@ +try: + from .common import folding_order, sort_data, window_based_shuffle +except ImportError: + from common import folding_order, sort_data, window_based_shuffle + + def order(in_data, args): score_field = args.score_field - layers = args.folding_layer + folding_layer = args.folding_layer + ascending = getattr(args, "ascending", True) + window_size = getattr(args, "window_size", 0) + seed = getattr(args, "seed", 42) - # folding order. 
- sorted_data = sorted(in_data, key=lambda x: x[score_field], reverse=False) - - out_data = list() - for l in range(layers): - sub_data = [sorted_data[i] for i in range(len(sorted_data)) if i % layers == l] - out_data.extend(sub_data) - return out_data + sorted_data = sort_data(in_data, score_field, ascending=ascending) + out_data = folding_order(sorted_data, folding_layer) + return window_based_shuffle(out_data, window_size=window_size, seed=seed) diff --git a/data_ordering/saw.py b/data_ordering/saw.py new file mode 100644 index 0000000..4aab220 --- /dev/null +++ b/data_ordering/saw.py @@ -0,0 +1,24 @@ +try: + from .common import cross_guidance_order, sort_data, window_based_shuffle +except ImportError: + from common import cross_guidance_order, sort_data, window_based_shuffle + + +def order(in_data, args): + score_field = args.score_field + ascending = getattr(args, "ascending", True) + num_sections = args.num_sections + folding_ratio = args.folding_ratio + folding_layer = args.folding_layer + window_size = getattr(args, "window_size", 0) + seed = getattr(args, "seed", 42) + + sorted_data = sort_data(in_data, score_field, ascending=ascending) + out_data = cross_guidance_order( + sorted_data, + num_sections=num_sections, + transition_ratio=folding_ratio, + folding_layer=folding_layer, + mode="zigzag", + ) + return window_based_shuffle(out_data, window_size=window_size, seed=seed) diff --git a/data_ordering/segment.py b/data_ordering/segment.py new file mode 100644 index 0000000..69f2144 --- /dev/null +++ b/data_ordering/segment.py @@ -0,0 +1,59 @@ +import random +import warnings + +try: + from .common import sort_data +except ImportError: + from common import sort_data + + +def _take_boundary(sorted_data, n_items, high): + if n_items <= 0: + return [], list(sorted_data) + if high: + return list(sorted_data[-n_items:]), list(sorted_data[:-n_items]) + return list(sorted_data[:n_items]), list(sorted_data[n_items:]) + + +def order(in_data, args): + """Segment 
Ordering (SEG). + + The front and back segments draw from low-score or high-score boundaries, + while the remaining samples form the middle segment. Each segment is + shuffled independently. + """ + score_field = args.score_field + total_samples = len(in_data) + seed = getattr(args, "seed", 42) + rng = random.Random(seed) + + sorted_data = sort_data(in_data, score_field, ascending=True) + n_front = int(total_samples * args.x_pct // 100) + n_back = int(total_samples * args.y_pct // 100) + total_selected = n_front + n_back + + if total_selected > total_samples: + ratio = total_samples / total_selected + n_front = int(n_front * ratio) + n_back = total_samples - n_front + warnings.warn( + f"x_pct + y_pct exceeds 100; resized to {n_front} front and {n_back} back samples.", + RuntimeWarning, + ) + + front_is_high = bool(args.front_is_high) + back_is_high = bool(args.back_is_high) + + if front_is_high == back_is_high: + selected, middle = _take_boundary(sorted_data, n_front + n_back, high=front_is_high) + rng.shuffle(selected) + front = selected[:n_front] + back = selected[n_front:] + else: + front, remaining = _take_boundary(sorted_data, n_front, high=front_is_high) + back, middle = _take_boundary(remaining, n_back, high=back_is_high) + + rng.shuffle(front) + rng.shuffle(middle) + rng.shuffle(back) + return front + middle + back diff --git a/data_ordering/shuffle.py b/data_ordering/shuffle.py index c0bc5a3..93c5d41 100644 --- a/data_ordering/shuffle.py +++ b/data_ordering/shuffle.py @@ -1,6 +1,9 @@ import random + def order(in_data, args): - random.seed(args.seed) - out_data = random.sample(in_data, len(in_data)) + seed = getattr(args, "seed", 42) + rng = random.Random(seed) + out_data = list(in_data) + rng.shuffle(out_data) return out_data diff --git a/data_ordering/sorting.py b/data_ordering/sorting.py index 18b3142..e0ce776 100644 --- a/data_ordering/sorting.py +++ b/data_ordering/sorting.py @@ -1,11 +1,18 @@ +try: + from .common import gumbel_rank_jitter, 
sorted_indices, window_based_shuffle +except ImportError: + from common import gumbel_rank_jitter, sorted_indices, window_based_shuffle + + def order(in_data, args): score_field = args.score_field + ascending = getattr(args, "ascending", True) + tau = getattr(args, "temperature", 1.0) + use_gumbel = getattr(args, "use_gumbel", False) + seed = getattr(args, "seed", 42) + window_size = getattr(args, "window_size", 0) - if args.ascending: - # ascending order. - out_data = sorted(in_data, key=lambda x: x[score_field], reverse=False) - else: - # descending order. - out_data = sorted(in_data, key=lambda x: x[score_field], reverse=True) - - return out_data + indices = sorted_indices(in_data, score_field, ascending=ascending) + indices = gumbel_rank_jitter(indices, tau=tau, use_gumbel=use_gumbel, seed=seed) + sorted_data = [in_data[idx] for idx in indices] + return window_based_shuffle(sorted_data, window_size=window_size, seed=seed) diff --git a/data_ordering/stair.py b/data_ordering/stair.py new file mode 100644 index 0000000..3010f60 --- /dev/null +++ b/data_ordering/stair.py @@ -0,0 +1,24 @@ +try: + from .common import cross_guidance_order, sort_data, window_based_shuffle +except ImportError: + from common import cross_guidance_order, sort_data, window_based_shuffle + + +def order(in_data, args): + score_field = args.score_field + ascending = getattr(args, "ascending", True) + num_sections = args.num_sections + folding_ratio = args.folding_ratio + folding_layer = args.folding_layer + window_size = getattr(args, "window_size", 0) + seed = getattr(args, "seed", 42) + + sorted_data = sort_data(in_data, score_field, ascending=ascending) + out_data = cross_guidance_order( + sorted_data, + num_sections=num_sections, + transition_ratio=folding_ratio, + folding_layer=folding_layer, + mode="folding", + ) + return window_based_shuffle(out_data, window_size=window_size, seed=seed) diff --git a/data_ordering/zigzag.py b/data_ordering/zigzag.py new file mode 100644 index 
0000000..d75dcb1 --- /dev/null +++ b/data_ordering/zigzag.py @@ -0,0 +1,16 @@ +try: + from .common import sort_data, window_based_shuffle, zigzag_order +except ImportError: + from common import sort_data, window_based_shuffle, zigzag_order + + +def order(in_data, args): + score_field = args.score_field + zigzag_layer = args.zigzag_layer + ascending = getattr(args, "ascending", True) + window_size = getattr(args, "window_size", 0) + seed = getattr(args, "seed", 42) + + sorted_data = sort_data(in_data, score_field, ascending=ascending) + out_data = zigzag_order(sorted_data, zigzag_layer) + return window_based_shuffle(out_data, window_size=window_size, seed=seed) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..c4854dd --- /dev/null +++ b/docs/README.md @@ -0,0 +1,19 @@ +# Documentation + +This folder keeps paper-specific documentation for the Data Efficacy codebase. The root README introduces the shared motivation and pipeline, while each page here records the details needed for a specific paper. + +## Papers + +| Work | Status | Page | +| --- | --- | --- | +| Data Efficacy for Language Model Training (DELT) | arXiv 2025 | [Data Efficacy for Language Model Training](./data_efficacy_for_language_model_training.md) | +| Demystifying Data Organization for Enhanced LLM Training | ACL 2026 | [Demystifying Data Organization for Enhanced LLM Training](./demystifying_data_organization_for_enhanced_llm_training.md) | + +## Adding New Work + +When adding a new data efficacy paper: + +- Add a new page under `docs/`. +- Put paper-specific figures under `docs/assets/<paper_name>/`. +- Link the page from both this index and the root [README](../README.md). +- Keep reusable code in the shared modules, such as `data_scoring`, `data_selection`, `data_ordering`, `model_train`, or `model_eval`.
diff --git a/docs/assets/data_organization_acl2026/dclm_loss.png b/docs/assets/data_organization_acl2026/dclm_loss.png new file mode 100644 index 0000000..8f0b77d Binary files /dev/null and b/docs/assets/data_organization_acl2026/dclm_loss.png differ diff --git a/docs/assets/data_organization_acl2026/order_patterns.png b/docs/assets/data_organization_acl2026/order_patterns.png new file mode 100644 index 0000000..8b84678 Binary files /dev/null and b/docs/assets/data_organization_acl2026/order_patterns.png differ diff --git a/docs/assets/data_organization_acl2026/overview.png b/docs/assets/data_organization_acl2026/overview.png new file mode 100644 index 0000000..1ffe5ec Binary files /dev/null and b/docs/assets/data_organization_acl2026/overview.png differ diff --git a/docs/data_efficacy_for_language_model_training.md b/docs/data_efficacy_for_language_model_training.md new file mode 100644 index 0000000..5a6e6c6 --- /dev/null +++ b/docs/data_efficacy_for_language_model_training.md @@ -0,0 +1,81 @@ +# Data Efficacy for Language Model Training + +

+ Task + Paper + License +

+ +

+ [Paper] ‒ + [HF Model] +

+ +This page documents **Data Efficacy for Language Model Training (DELT)**, the original work supported by this repository. For the repository-wide overview and common running commands, see the root [README](../README.md). + +DELT studies how pre-computed sample-level scores can be reused across **Data Scoring**, **Data Selection**, and **Data Ordering** to improve language model training. + +![DELT results](../figures/fig1_result.jpg) + +## Contributions + +DELT frames data curation as a data efficacy problem: once sample-level scores are available, they should guide more than a one-time filtering decision. + +The paper introduces a connected pipeline with three core components: + +- **Learnability-Quality Scoring (LQS)** scores each sample by considering both learnability and quality from the gradient consistency perspective. +- **Score-based Data Selection** constructs training subsets from the scored corpus under a data budget. +- **Folding Ordering (FO)** reuses the same scores to organize selected data before training, mitigating forgetting and distribution bias during one-pass training. + +![DELT paradigm](../figures/data_efficacy_paradigm.png) + +## Learnability-Quality Scoring + +LQS is the data scoring component of DELT. It estimates sample-level utility by combining learnability and quality signals, so the score can later be reused by both selection and ordering. + +![Learnability-Quality Scoring](../figures/fig2_score.jpg) + +Implementation entry point: + +```bash +bash data_scoring/entry.sh \ + data/source-cc-1b.jsonl \ + data/source-cc-1b_scored-lqs.jsonl \ + lqs \ + data_scoring/config/lqs.yaml +``` + +## Folding Ordering + +FO is the data ordering component introduced by DELT. It orders selected samples according to their scores with a folding pattern, so training can revisit different score regions and reduce distribution bias. 
+ +![Folding Ordering](../figures/fig3_order.jpg) + +Implementation entry point: + +```bash +bash data_ordering/entry.sh \ + data/source-cc-1b_scored-lqs_selected-r1.0.jsonl \ + data/source-cc-1b_scored-lqs_selected-r1.0_ordered-folding-l3.jsonl \ + folding \ + data_ordering/config/folding.yaml +``` + +## Modules + +- [data_scoring](../data_scoring): LQS and KenLM-based scoring. +- [data_selection](../data_selection): top-r, top-k, and threshold selection. +- [data_ordering](../data_ordering): score-based ordering, including DELT's Folding Ordering. +- [model_train](../model_train): pre-training. +- [model_eval](../model_eval): evaluation. + +## Citation + +```bibtex +@article{dai2025data, + title={Data Efficacy for Language Model Training}, + author={Yalun Dai and Yangyu Huang and Xin Zhang and Wenshan Wu and Chong Li and Wenhui Lu and Shijie Cao and Li Dong and Scarlett Li}, + journal={arXiv preprint arXiv:2506.21545}, + year={2025} +} +``` diff --git a/docs/demystifying_data_organization_for_enhanced_llm_training.md b/docs/demystifying_data_organization_for_enhanced_llm_training.md new file mode 100644 index 0000000..14ea731 --- /dev/null +++ b/docs/demystifying_data_organization_for_enhanced_llm_training.md @@ -0,0 +1,166 @@ +# Demystifying Data Organization for Enhanced LLM Training + +

+ Task + Venue + License +

+ +

+ [Paper] ‒ + [HF Model] +

+ +This page introduces the code and usage for our ACL 2026 work, **"Demystifying Data Organization for Enhanced LLM Training"**. The code is implemented as an extension of DELT's **Data Ordering** stage, so it can reuse the same scored or selected JSONL data produced by the original DELT pipeline. + +![Overview of data organization guidances and scaling results](assets/data_organization_acl2026/overview.png) + +## Overview + +DELT shows that pre-computed sample-level scores can guide data selection and ordering. This follow-up focuses on a narrower question: once each sample already has a score, how should the training data be organized? + +The paper identifies four guidances for data organization: + +- **Boundary Sharpening**: control the score distribution at the beginning and end of training. +- **Cyclic Scheduling**: periodically revisit data across the score spectrum during one-pass training. +- **Curriculum Continuity**: avoid abrupt score jumps that can shock the optimizer. +- **Local Diversity**: keep enough heterogeneity in local windows or mini-batches. + +The implementation in this folder adds ordering strategies that instantiate these guidances while keeping the input and output format compatible with DELT. + +## Methods + +Supported ordering methods: + +- `shuffle`: random ordering baseline. +- `sorting`: score sorting / curriculum learning baseline. +- `folding`: Folding Ordering (FO), inherited from DELT. +- `zigzag`: Zig-zag Ordering (ZIG), which reverses odd FO layers to improve curriculum continuity. +- `segment`: Segment Ordering (SEG), used for boundary sharpening. +- `stair`: Stair Ordering (STR), which applies FO in local transition regions. +- `saw`: Saw Ordering (SAW), which applies ZIG in local transition regions. + +Set `window_size > 1` in any method config to apply Jittering Ordering (JIT), which shuffles samples inside local windows while preserving the global trend. 
+ +![Score-index distributions under different data organization strategies](assets/data_organization_acl2026/order_patterns.png) + +## Datasets + +For the general-domain pre-training experiments, we use sampled subsets from public scored corpora on Hugging Face: + +- [FineWeb-Edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu), which provides educational-quality web data from FineWeb. +- [QuRatedPajama-260B](https://huggingface.co/datasets/princeton-nlp/QuRatedPajama-260B), which provides SlimPajama-based data annotated with quality ratings. + +After sampling and preprocessing, save the data as JSONL with a sample-level score field, then pass it to the ordering scripts. The `score_field` used by the ordering methods can be configured in each YAML file under [data_ordering/config](../data_ordering/config). + +You can use the existing loader in `utils.py` to materialize sampled JSONL files: + +```bash +# FineWeb-Edu sample +python utils.py \ + --content dataset \ + --id HuggingFaceFW/fineweb-edu \ + --data-name sample-10BT \ + --save-dir data/fineweb_edu_sample.jsonl \ + --split-name train \ + --sample-size <num_examples> + +# QuRatedPajama sample +python utils.py \ + --content dataset \ + --id princeton-nlp/QuRatedPajama-260B \ + --save-dir data/quratedpajama_sample.jsonl \ + --split-name train \ + --sample-size <num_examples> +``` + +Here `--sample-size` is the number of examples loaded by `utils.py`. If an experiment uses a token-based budget, perform token-based sampling or truncation during preprocessing before running the ordering stage. + +## Data Format + +The input is a JSONL file. Each row must contain the configured score field, whose default name is `average_test_score`. + +```json +{"text": "sample text", "average_test_score": 3.7} +{"text": "another sample", "average_test_score": 1.2} +``` + +The output is another JSONL file with the same records in a new order. The ordering scripts do not change the dataset size or sample content.
+ +## Usage + +Run SAW: + +```bash +bash data_ordering/entry.sh \ + data/scored_data.jsonl \ + data/ordered_saw.jsonl \ + saw \ + data_ordering/config/saw.yaml +``` + +Run STR: + +```bash +bash data_ordering/entry.sh \ + data/scored_data.jsonl \ + data/ordered_stair.jsonl \ + stair \ + data_ordering/config/stair.yaml +``` + +Run a baseline: + +```bash +bash data_ordering/entry.sh \ + data/scored_data.jsonl \ + data/ordered_folding.jsonl \ + folding \ + data_ordering/config/folding.yaml +``` + +Equivalent Python entry: + +```bash +python3 data_ordering/entry.py \ + --input_data_path data/scored_data.jsonl \ + --output_data_path data/ordered_saw.jsonl \ + --method saw \ + --config_path data_ordering/config/saw.yaml +``` + +## Configuration + +Configs are stored in [data_ordering/config](../data_ordering/config). + +Common fields: + +- `score_field`: JSONL field containing the sample score. +- `ascending`: whether lower-score samples appear earlier before method-specific reordering. +- `seed`: random seed for shuffle and JIT. +- `window_size`: local JIT window. Set `0` to disable JIT. + +Method-specific fields: + +- `folding_layer`: number of FO layers for `folding`, `stair`, and `saw`. +- `zigzag_layer`: number of ZIG layers for `zigzag`. +- `num_sections`: number of global sections for STR/SAW. +- `folding_ratio`: transition-region radius as a fraction of the dataset size for STR/SAW. +- `x_pct`, `y_pct`, `front_is_high`, `back_is_high`: boundary settings for SEG. + +## Scaling Results + +STR and SAW are designed to preserve the benefits of score-based curricula while improving stability and diversity. In the paper, they consistently improve language modeling loss over random ordering across model scales. 
+ +![DCLM test losses across model sizes](assets/data_organization_acl2026/dclm_loss.png) + +## Citation + +```bibtex +@inproceedings{dai2026demystifying, + title={Demystifying Data Organization for Enhanced LLM Training}, + author={Yalun Dai and Yangyu Huang and Tongshen Yang and Yonghan Wang and Xin Zhang and Wenshan Wu and Qihao Zhao and Hao Li and Yuanyuan Gao and Kim-Hui Yap and Scarlett Li}, + booktitle={Proceedings of the Annual Meeting of the Association for Computational Linguistics}, + year={2026} +} +``` diff --git a/model_eval/config/all.yaml b/model_eval/config/all.yaml index 0913f30..5b04e01 100644 --- a/model_eval/config/all.yaml +++ b/model_eval/config/all.yaml @@ -1,25 +1,25 @@ -name: all -version: 1.0 -description: Config of all benchmark in model evaluation. - -device: cuda:0 -model_format: hf -batch_size: 8 -seed: 10 -add_bos_token: true -# auto, float, half -dtype: auto -tasks: - - hellaswag - - sciq - - arc_easy - - arc_challenge - - openbookqa - - piqa - - winogrande - - lambada_openai - - gsm8k - - mathqa - - gpqa_diamond_zeroshot - - humaneval - - mbpp +name: all +version: 1.0 +description: Config of all benchmark in model evaluation. + +device: cuda:0 +model_format: hf +batch_size: 8 +seed: 10 +add_bos_token: true +# auto, float, half +dtype: auto +tasks: + - hellaswag + - sciq + - arc_easy + - arc_challenge + - openbookqa + - piqa + - winogrande + - lambada_openai + - gsm8k + - mathqa + - gpqa_diamond_zeroshot + - humaneval + - mbpp diff --git a/model_eval/config/code.yaml b/model_eval/config/code.yaml index 036f35b..49bbbdb 100644 --- a/model_eval/config/code.yaml +++ b/model_eval/config/code.yaml @@ -1,14 +1,14 @@ -name: code -version: 1.0 -description: Config of code benchmark in model evaluation. - -device: cuda:0 -model_format: hf -batch_size: 1 -seed: 10 -add_bos_token: true -# auto, float, half -dtype: auto -tasks: - - humaneval - - mbpp +name: code +version: 1.0 +description: Config of code benchmark in model evaluation. 
+ +device: cuda:0 +model_format: hf +batch_size: 1 +seed: 10 +add_bos_token: true +# auto, float, half +dtype: auto +tasks: + - humaneval + - mbpp diff --git a/model_eval/config/general.yaml b/model_eval/config/general.yaml index 35ee708..e5f9fc6 100644 --- a/model_eval/config/general.yaml +++ b/model_eval/config/general.yaml @@ -1,20 +1,20 @@ -name: general -version: 1.0 -description: Config of general benchmark in model evaluation. - -device: cuda:0 -model_format: hf -batch_size: 64 -seed: 10 -add_bos_token: true -# auto, float, half -dtype: auto -tasks: - - hellaswag - - sciq - - arc_easy - - arc_challenge - - openbookqa - - piqa - - winogrande - - lambada_openai +name: general +version: 1.0 +description: Config of general benchmark in model evaluation. + +device: cuda:0 +model_format: hf +batch_size: 64 +seed: 10 +add_bos_token: true +# auto, float, half +dtype: auto +tasks: + - hellaswag + - sciq + - arc_easy + - arc_challenge + - openbookqa + - piqa + - winogrande + - lambada_openai diff --git a/model_eval/config/math.yaml b/model_eval/config/math.yaml index 54a4d22..fa79f48 100644 --- a/model_eval/config/math.yaml +++ b/model_eval/config/math.yaml @@ -1,15 +1,15 @@ -name: math -version: 1.0 -description: Config of math benchmark in model evaluation. - -device: cuda:0 -model_format: hf -batch_size: 8 -seed: 10 -add_bos_token: true -# auto, float, half -dtype: auto -tasks: - - gsm8k - - mathqa - - gpqa_diamond_zeroshot +name: math +version: 1.0 +description: Config of math benchmark in model evaluation. 
+ +device: cuda:0 +model_format: hf +batch_size: 8 +seed: 10 +add_bos_token: true +# auto, float, half +dtype: auto +tasks: + - gsm8k + - mathqa + - gpqa_diamond_zeroshot diff --git a/model_eval/entry.py b/model_eval/entry.py index 8347767..ec85381 100644 --- a/model_eval/entry.py +++ b/model_eval/entry.py @@ -1,25 +1,25 @@ -import os -import sys -sys.path.insert(0, os.getcwd()) -import json -import argparse -import lm_evaluation_harness -from utils import load_yaml, write_yaml, add_args - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Model evaluation.") - parser.add_argument("--input_model_path", type=str, required=True, help="The path of model to be evaluated.") - parser.add_argument("--output_result_path", type=str, required=True, help="The path of result.") - parser.add_argument("--method", type=str, choices=["lm_evaluation_harness"], default="lm_evaluation_harness", - help="Evaluation method: 'lm_evaluation_harness'. Defaults to 'lm_evaluation_harness'.") - parser.add_argument("--config_path", type=str, default="./config/general.yaml", help="Config file for additional parameters (YAML format).") - - args = parser.parse_args() - args = add_args(args, load_yaml(args.config_path)) - - if args.method == "lm_evaluation_harness": - out_result = lm_evaluation_harness.eval(args.input_model_path, args) - - print(f"The evaluation results are saved to {args.output_result_path}\n{json.dumps(out_result, indent=4)}") - write_yaml(args.output_result_path, out_result) +import os +import sys +sys.path.insert(0, os.getcwd()) +import json +import argparse +import lm_evaluation_harness +from utils import load_yaml, write_yaml, add_args + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Model evaluation.") + parser.add_argument("--input_model_path", type=str, required=True, help="The path of model to be evaluated.") + parser.add_argument("--output_result_path", type=str, required=True, help="The path of result.") + 
parser.add_argument("--method", type=str, choices=["lm_evaluation_harness"], default="lm_evaluation_harness", + help="Evaluation method: 'lm_evaluation_harness'. Defaults to 'lm_evaluation_harness'.") + parser.add_argument("--config_path", type=str, default="./config/general.yaml", help="Config file for additional parameters (YAML format).") + + args = parser.parse_args() + args = add_args(args, load_yaml(args.config_path)) + + if args.method == "lm_evaluation_harness": + out_result = lm_evaluation_harness.eval(args.input_model_path, args) + + print(f"The evaluation results are saved to {args.output_result_path}\n{json.dumps(out_result, indent=4)}") + write_yaml(args.output_result_path, out_result) diff --git a/model_eval/entry.sh b/model_eval/entry.sh index f390101..2d26f2d 100644 --- a/model_eval/entry.sh +++ b/model_eval/entry.sh @@ -1,12 +1,12 @@ -#!/bin/bash - -INPUT_MODEL_PATH=${1-"./model/output_model"} -OUTPUT_RESULT_PATH=${2-"./result/general.jsonl"} -METHOD=${3-"lm_evaluation_harness"} -CONFIG_PATH=${4-"./model_eval/config/general.yaml"} - -python model_eval/entry.py \ - --input_model_path $INPUT_MODEL_PATH \ - --output_result_path $OUTPUT_RESULT_PATH \ - --method $METHOD \ - --config $CONFIG_PATH \ +#!/bin/bash + +INPUT_MODEL_PATH=${1-"./model/output_model"} +OUTPUT_RESULT_PATH=${2-"./result/general.jsonl"} +METHOD=${3-"lm_evaluation_harness"} +CONFIG_PATH=${4-"./model_eval/config/general.yaml"} + +python model_eval/entry.py \ + --input_model_path $INPUT_MODEL_PATH \ + --output_result_path $OUTPUT_RESULT_PATH \ + --method $METHOD \ + --config $CONFIG_PATH \ diff --git a/model_eval/lm_evaluation_harness.py b/model_eval/lm_evaluation_harness.py index 5d16f42..91aeb9a 100644 --- a/model_eval/lm_evaluation_harness.py +++ b/model_eval/lm_evaluation_harness.py @@ -1,37 +1,37 @@ -import os -import lm_eval - -os.environ["HF_ALLOW_CODE_EVAL"] = "1" -os.environ["TOKENIZERS_PARALLELISM"] = "false" - - -def calc_ave_acc(out_result): - # Don't change the order of 
this list. - metric_list = ("acc_norm,none", "acc,none", "pass@1,create_test", "pass_at_1,none", "exact_match,strict-match", ) - for ben_name, result in out_result.items(): - for metric_name in metric_list: - if metric_name in result: - out_result[ben_name] = round(result[metric_name], 6) - break - out_result["summary_average"] = round(sum(out_result.values()) / max(len(out_result), 1), 6) - return out_result - - -def eval(model_path, args): - model_args = {"pretrained": model_path, "add_bos_token": args.add_bos_token, "dtype": args.dtype} - results = lm_eval.simple_evaluate( - model=args.model_format, - model_args=model_args, - tasks=args.tasks, - device=args.device, - batch_size=args.batch_size, - random_seed=args.seed, - numpy_random_seed=args.seed, - torch_random_seed=args.seed, - fewshot_random_seed=args.seed, - confirm_run_unsafe_code=True, - #num_fewshot=0, - #limit=5, - ) - out_result = calc_ave_acc(results["results"]) - return out_result +import os +import lm_eval + +os.environ["HF_ALLOW_CODE_EVAL"] = "1" +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +def calc_ave_acc(out_result): + # Don't change the order of this list. 
+ metric_list = ("acc_norm,none", "acc,none", "pass@1,create_test", "pass_at_1,none", "exact_match,strict-match", ) + for ben_name, result in out_result.items(): + for metric_name in metric_list: + if metric_name in result: + out_result[ben_name] = round(result[metric_name], 6) + break + out_result["summary_average"] = round(sum(out_result.values()) / max(len(out_result), 1), 6) + return out_result + + +def eval(model_path, args): + model_args = {"pretrained": model_path, "add_bos_token": args.add_bos_token, "dtype": args.dtype} + results = lm_eval.simple_evaluate( + model=args.model_format, + model_args=model_args, + tasks=args.tasks, + device=args.device, + batch_size=args.batch_size, + random_seed=args.seed, + numpy_random_seed=args.seed, + torch_random_seed=args.seed, + fewshot_random_seed=args.seed, + confirm_run_unsafe_code=True, + #num_fewshot=0, + #limit=5, + ) + out_result = calc_ave_acc(results["results"]) + return out_result diff --git a/model_train/config/deepspeed.json b/model_train/config/deepspeed.json index ef24d49..7bd14fa 100644 --- a/model_train/config/deepspeed.json +++ b/model_train/config/deepspeed.json @@ -1,16 +1,16 @@ -{ - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "zero_optimization": { - "stage": 1 - }, - "zero_allow_untested_optimizer": true, - "fp16": { - "enabled": true, - "loss_scale": 0, - "initial_scale_power": 11, - "loss_scale_window": 2000, - "hysteresis": 4 - }, - "wall_clock_breakdown": false +{ + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 1 + }, + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": true, + "loss_scale": 0, + "initial_scale_power": 11, + "loss_scale_window": 2000, + "hysteresis": 4 + }, + "wall_clock_breakdown": false } \ No newline at end of file diff --git a/model_train/config/train.yaml b/model_train/config/train.yaml index f1ea2bf..e805fc8 100644 --- 
a/model_train/config/train.yaml +++ b/model_train/config/train.yaml @@ -1,71 +1,71 @@ -name: pre_train -version: 1.0 -description: Config of model training. - -# data -seed: 10 -order_seed: 10 -instruct_field: instruct -text_field: text -max_length: 1024 -truncation: true -# don't change the shuffle_data mode. -shuffle_data: false -train_num: null -data_split: null -min_offset: 0 -min_state: 0 -min_prompt_length: 128 -max_prompt_length: 512 -bin_data: false -json_data: true - -# model -model_type: mistral -from_scratch: true -attn_impl: eager -xops_attn: true -model_parallel: false -dropout_path_rate: null -gradient_checkpointing: false -#padding_side: left -#padding_side: right - -# training -do_train: true -batch_size: 8 -gradient_accumulation_steps: 4 -save_strategy: "steps" -save_steps: 10000 - -# optimizer -optimizer_name: adamw_torch -weight_decay: 0.01 -lr: 0.0006 -adam_beta: 0.9 -adam_beta2: 0.98 -adam_eps: 0.000001 - -# lr schedule -num_epochs: 1 -lr_scheduler_type: cosine_with_min_lr -warmup_iters: 2000 -lr_min: 0.00006 -# lr_scheduler_type: constant -# warmup_iters: 0 -# lr_min: 0.0006 - -# log -log_name: steps -log_interval: 10 -# none, azure_ml, comet_ml, mlflow, neptune, tensorboard, wandb, codecarbon, clearml, dagshub, flyte, dvclive -report_name: [] -#report_name: -# - wandb -#wandb_project_name: "delt" -log_level: passive - -# deepspeed -deepspeed: true -deepspeed_config: model_train/config/deepspeed.json -clip_grad: 1.0 +name: pre_train +version: 1.0 +description: Config of model training. + +# data +seed: 10 +order_seed: 10 +instruct_field: instruct +text_field: text +max_length: 1024 +truncation: true +# don't change the shuffle_data mode. 
+shuffle_data: false +train_num: null +data_split: null +min_offset: 0 +min_state: 0 +min_prompt_length: 128 +max_prompt_length: 512 +bin_data: false +json_data: true + +# model +model_type: mistral +from_scratch: true +attn_impl: eager +xops_attn: true +model_parallel: false +dropout_path_rate: null +gradient_checkpointing: false +#padding_side: left +#padding_side: right + +# training +do_train: true +batch_size: 8 +gradient_accumulation_steps: 4 +save_strategy: "steps" +save_steps: 10000 + +# optimizer +optimizer_name: adamw_torch +weight_decay: 0.01 +lr: 0.0006 +adam_beta: 0.9 +adam_beta2: 0.98 +adam_eps: 0.000001 + +# lr schedule +num_epochs: 1 +lr_scheduler_type: cosine_with_min_lr +warmup_iters: 2000 +lr_min: 0.00006 +# lr_scheduler_type: constant +# warmup_iters: 0 +# lr_min: 0.0006 + +# log +log_name: steps +log_interval: 10 +# none, azure_ml, comet_ml, mlflow, neptune, tensorboard, wandb, codecarbon, clearml, dagshub, flyte, dvclive +report_name: [] +#report_name: +# - wandb +#wandb_project_name: "delt" +log_level: passive + +# deepspeed +deepspeed: true +deepspeed_config: model_train/config/deepspeed.json +clip_grad: 1.0 diff --git a/model_train/entry.py b/model_train/entry.py index 24031b0..98f76bf 100644 --- a/model_train/entry.py +++ b/model_train/entry.py @@ -1,25 +1,25 @@ -import os -import sys -sys.path.insert(0, os.getcwd()) -import argparse -import deepspeed -from trainer import train -from utils import load_yaml, init, add_args - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Model training.") - parser.add_argument("--local_rank", type=int, help="Local rank for deepspeed.", default=0) - parser.add_argument("--data_path", type=str, required=True, help="The path of training data.") - parser.add_argument("--model_path", type=str, required=True, help="The input path of model.") - parser.add_argument("--save", type=str, required=True, help="The save path of model.") - parser.add_argument("--method", type=str, 
choices=["pretrain", "posttrain"], default="pretrain", help="Training type: 'pretrain' and 'posttrain'.") - parser.add_argument("--config_path", type=str, default="./model_train/config/train.yaml", help="Config file for additional parameters (YAML format).") - - args = parser.parse_args() - parser = deepspeed.add_config_arguments(parser) - args = add_args(args, load_yaml(args.config_path)) - - init(args) - if args.method in ["pretrain", "posttrain"]: - train(args) +import os +import sys +sys.path.insert(0, os.getcwd()) +import argparse +import deepspeed +from trainer import train +from utils import load_yaml, init, add_args + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Model training.") + parser.add_argument("--local_rank", type=int, help="Local rank for deepspeed.", default=0) + parser.add_argument("--data_path", type=str, required=True, help="The path of training data.") + parser.add_argument("--model_path", type=str, required=True, help="The input path of model.") + parser.add_argument("--save", type=str, required=True, help="The save path of model.") + parser.add_argument("--method", type=str, choices=["pretrain", "posttrain"], default="pretrain", help="Training type: 'pretrain' and 'posttrain'.") + parser.add_argument("--config_path", type=str, default="./model_train/config/train.yaml", help="Config file for additional parameters (YAML format).") + + args = parser.parse_args() + parser = deepspeed.add_config_arguments(parser) + args = add_args(args, load_yaml(args.config_path)) + + init(args) + if args.method in ["pretrain", "posttrain"]: + train(args) diff --git a/model_train/entry.sh b/model_train/entry.sh index 6ba2d1f..ed95111 100644 --- a/model_train/entry.sh +++ b/model_train/entry.sh @@ -1,34 +1,34 @@ -#!/bin/bash - -INPUT_DATA_PATH=${1-'./data/ordered_data.jsonl'} -INPUT_MODEL_PATH=${2-'./model/input_model'} -OUTPUT_MODEL_PATH=${3-'./model/output_model'} -METHOD=${4-'pretrain'} -CONFIG_PATH=${5-'train.yaml'} - 
-GPUS_PER_NODE=${6-1} -NNODES=${7-1} -#MASTER_PORT=${8-2030} -min_port=2048 -max_port=65535 -MASTER_PORT=$((RANDOM % (max_port - min_port + 1) + min_port)) - -DISTRIBUTED_ARGS="--num_gpus $GPUS_PER_NODE \ - --num_nodes $NNODES \ - --master_port $MASTER_PORT" - -export NCCL_DEBUG="" -export WANDB_DISABLED=True -export TF_CPP_MIN_LOG_LEVEL=3 -export OMP_NUM_THREADS=16 - -CMD="deepspeed ${DISTRIBUTED_ARGS} model_train/entry.py \ - --data_path ${INPUT_DATA_PATH} \ - --model_path ${INPUT_MODEL_PATH} \ - --save ${OUTPUT_MODEL_PATH} \ - --method ${METHOD} \ - --config_path ${CONFIG_PATH}" - -echo ${CMD} -mkdir -p ${OUTPUT_MODEL_PATH} -${CMD} +#!/bin/bash + +INPUT_DATA_PATH=${1-'./data/ordered_data.jsonl'} +INPUT_MODEL_PATH=${2-'./model/input_model'} +OUTPUT_MODEL_PATH=${3-'./model/output_model'} +METHOD=${4-'pretrain'} +CONFIG_PATH=${5-'train.yaml'} + +GPUS_PER_NODE=${6-1} +NNODES=${7-1} +#MASTER_PORT=${8-2030} +min_port=2048 +max_port=65535 +MASTER_PORT=$((RANDOM % (max_port - min_port + 1) + min_port)) + +DISTRIBUTED_ARGS="--num_gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --master_port $MASTER_PORT" + +export NCCL_DEBUG="" +export WANDB_DISABLED=True +export TF_CPP_MIN_LOG_LEVEL=3 +export OMP_NUM_THREADS=16 + +CMD="deepspeed ${DISTRIBUTED_ARGS} model_train/entry.py \ + --data_path ${INPUT_DATA_PATH} \ + --model_path ${INPUT_MODEL_PATH} \ + --save ${OUTPUT_MODEL_PATH} \ + --method ${METHOD} \ + --config_path ${CONFIG_PATH}" + +echo ${CMD} +mkdir -p ${OUTPUT_MODEL_PATH} +${CMD} diff --git a/model_train/trainer.py b/model_train/trainer.py index 2b4f919..89eb673 100644 --- a/model_train/trainer.py +++ b/model_train/trainer.py @@ -1,151 +1,151 @@ -import os -import copy -#import wandb -import torch -from utils import get_model, get_tokenizer -from datasets import load_dataset -from torch.utils.data import DataLoader -from accelerate.utils import DistributedType -from transformers import TrainingArguments, Trainer -#from transformers import AutoTokenizer, 
AutoModelForCausalLM, DataCollatorWithPadding - -os.environ["TOKENIZERS_PARALLELISM"] = "false" -IGNORE_INDEX = -100 - -class CustomTrainer(Trainer): - def get_train_dataloader(self): - dataloader_params = { - "batch_size": self._train_batch_size, - "collate_fn": self.data_collator, - "num_workers": self.args.dataloader_num_workers, - "pin_memory": self.args.dataloader_pin_memory, - "persistent_workers": self.args.dataloader_persistent_workers, - "drop_last": self.args.dataloader_drop_last, - "prefetch_factor": self.args.dataloader_prefetch_factor, - "shuffle": False, - #"sampler": self._get_train_sampler(), - #"worker_init_fn": seed_worker, - } - self.train_dataset = self._remove_unused_columns(self.train_dataset, description="training") - return self.accelerator.prepare(DataLoader(self.train_dataset, **dataloader_params)) - - def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False): - outputs = model(**inputs) - losses = outputs.loss - return (losses, outputs) if return_outputs else losses - -def preprocess(tokenizer, examples, instruct_field, text_field, max_length=1024, padding="max_length"):# max_length, longest - instructs = examples.get(instruct_field, None) - texts = examples.get(text_field, None) - - input_ids_list = list() - labels_list = list() - attention_mask_list = list() - for i in range(len(texts)): - instruct = instructs[i] if isinstance(instructs, list) else "" - text = instruct + texts[i] - - tokenized_example = tokenizer(text, padding=padding, add_special_tokens=True, truncation=True, max_length=max_length) - input_ids = tokenized_example["input_ids"] - attention_mask = tokenized_example["attention_mask"] - labels = copy.deepcopy(input_ids) - - tokenized_instruct = tokenizer(instruct, padding=padding, add_special_tokens=True, truncation=True, max_length=max_length) - input_len = len(tokenized_instruct["input_ids"]) - tokenized_instruct["input_ids"].count(tokenizer.pad_token_id) - labels[:input_len] = [IGNORE_INDEX] * 
input_len - - input_ids_list.append(input_ids) - labels_list.append(labels) - attention_mask_list.append(attention_mask) - - return dict( - input_ids=input_ids_list, - labels=labels_list, - attention_mask=attention_mask_list, - ) - -def train(args): - # device. - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - print(f"device: {device}, cpu count: {os.cpu_count()}") - - # tokenizer. - # tokenizer = AutoTokenizer.from_pretrained(args.model_path) - tokenizer = get_tokenizer(args) - - # model. - # model = AutoModelForCausalLM.from_pretrained(args.model_path, torch_dtype="auto", trust_remote_code=True) - model = get_model(args, device=device) - model.train() - - # data. - dataset = load_dataset("json", data_files={"train": args.data_path}) - train_dataset = dataset["train"] - # For debug. - #train_dataset = train_dataset.select(range(100)) - - train_dataset = train_dataset.map( - lambda example: preprocess(tokenizer, example, args.instruct_field, args.text_field, args.max_length), - batched=True, num_proc=os.cpu_count(), load_from_cache_file=False) - #train_dataset = train_dataset.filter(lambda example: len(example["input_ids"]) > 0) - #if args.shuffle_data: - # train_dataset = train_dataset.shuffle(seed=args.seed) - - # train args. 
- #wandb.init(project=os.path.basename(args.save)) - training_args = TrainingArguments( - output_dir=args.save, - overwrite_output_dir=True, - fp16=True, - deepspeed=args.deepspeed_config, - lr_scheduler_kwargs={"min_lr": args.lr_min}, - #report_to="none", - save_strategy=args.save_strategy, - save_steps=args.save_steps, - save_total_limit=10, - ) - - training_args.set_training( - learning_rate=args.lr, - batch_size=args.batch_size, - weight_decay=args.weight_decay, - num_epochs=args.num_epochs, - gradient_accumulation_steps=args.gradient_accumulation_steps, - seed=args.seed, - gradient_checkpointing=args.gradient_checkpointing, - ) - - training_args.set_optimizer( - name=args.optimizer_name, - weight_decay=args.weight_decay, - learning_rate=args.lr, - beta1=args.adam_beta, - beta2=args.adam_beta2, - epsilon=args.adam_eps, - ) - - training_args.set_lr_scheduler( - name=args.lr_scheduler_type, - num_epochs=args.num_epochs, - warmup_steps=args.warmup_iters, - ) - - training_args.set_logging( - strategy=args.log_name, - steps=args.log_interval, - report_to=args.report_name, - level=args.log_level, - ) - - training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED - trainer = CustomTrainer( - model=model, - args=training_args, - train_dataset=train_dataset, - eval_dataset=None, - #tokenizer=tokenizer, - ) - trainer.train() - trainer.save_model(args.save) - tokenizer.save_pretrained(args.save) - torch.distributed.destroy_process_group() +import os +import copy +#import wandb +import torch +from utils import get_model, get_tokenizer +from datasets import load_dataset +from torch.utils.data import DataLoader +from accelerate.utils import DistributedType +from transformers import TrainingArguments, Trainer +#from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorWithPadding + +os.environ["TOKENIZERS_PARALLELISM"] = "false" +IGNORE_INDEX = -100 + +class CustomTrainer(Trainer): + def get_train_dataloader(self): + dataloader_params = { + 
"batch_size": self._train_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + "drop_last": self.args.dataloader_drop_last, + "prefetch_factor": self.args.dataloader_prefetch_factor, + "shuffle": False, + #"sampler": self._get_train_sampler(), + #"worker_init_fn": seed_worker, + } + self.train_dataset = self._remove_unused_columns(self.train_dataset, description="training") + return self.accelerator.prepare(DataLoader(self.train_dataset, **dataloader_params)) + + def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False): + outputs = model(**inputs) + losses = outputs.loss + return (losses, outputs) if return_outputs else losses + +def preprocess(tokenizer, examples, instruct_field, text_field, max_length=1024, padding="max_length"):# max_length, longest + instructs = examples.get(instruct_field, None) + texts = examples.get(text_field, None) + + input_ids_list = list() + labels_list = list() + attention_mask_list = list() + for i in range(len(texts)): + instruct = instructs[i] if isinstance(instructs, list) else "" + text = instruct + texts[i] + + tokenized_example = tokenizer(text, padding=padding, add_special_tokens=True, truncation=True, max_length=max_length) + input_ids = tokenized_example["input_ids"] + attention_mask = tokenized_example["attention_mask"] + labels = copy.deepcopy(input_ids) + + tokenized_instruct = tokenizer(instruct, padding=padding, add_special_tokens=True, truncation=True, max_length=max_length) + input_len = len(tokenized_instruct["input_ids"]) - tokenized_instruct["input_ids"].count(tokenizer.pad_token_id) + labels[:input_len] = [IGNORE_INDEX] * input_len + + input_ids_list.append(input_ids) + labels_list.append(labels) + attention_mask_list.append(attention_mask) + + return dict( + input_ids=input_ids_list, + labels=labels_list, + 
attention_mask=attention_mask_list, + ) + +def train(args): + # device. + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + print(f"device: {device}, cpu count: {os.cpu_count()}") + + # tokenizer. + # tokenizer = AutoTokenizer.from_pretrained(args.model_path) + tokenizer = get_tokenizer(args) + + # model. + # model = AutoModelForCausalLM.from_pretrained(args.model_path, torch_dtype="auto", trust_remote_code=True) + model = get_model(args, device=device) + model.train() + + # data. + dataset = load_dataset("json", data_files={"train": args.data_path}) + train_dataset = dataset["train"] + # For debug. + #train_dataset = train_dataset.select(range(100)) + + train_dataset = train_dataset.map( + lambda example: preprocess(tokenizer, example, args.instruct_field, args.text_field, args.max_length), + batched=True, num_proc=os.cpu_count(), load_from_cache_file=False) + #train_dataset = train_dataset.filter(lambda example: len(example["input_ids"]) > 0) + #if args.shuffle_data: + # train_dataset = train_dataset.shuffle(seed=args.seed) + + # train args. 
+ #wandb.init(project=os.path.basename(args.save)) + training_args = TrainingArguments( + output_dir=args.save, + overwrite_output_dir=True, + fp16=True, + deepspeed=args.deepspeed_config, + lr_scheduler_kwargs={"min_lr": args.lr_min}, + #report_to="none", + save_strategy=args.save_strategy, + save_steps=args.save_steps, + save_total_limit=10, + ) + + training_args.set_training( + learning_rate=args.lr, + batch_size=args.batch_size, + weight_decay=args.weight_decay, + num_epochs=args.num_epochs, + gradient_accumulation_steps=args.gradient_accumulation_steps, + seed=args.seed, + gradient_checkpointing=args.gradient_checkpointing, + ) + + training_args.set_optimizer( + name=args.optimizer_name, + weight_decay=args.weight_decay, + learning_rate=args.lr, + beta1=args.adam_beta, + beta2=args.adam_beta2, + epsilon=args.adam_eps, + ) + + training_args.set_lr_scheduler( + name=args.lr_scheduler_type, + num_epochs=args.num_epochs, + warmup_steps=args.warmup_iters, + ) + + training_args.set_logging( + strategy=args.log_name, + steps=args.log_interval, + report_to=args.report_name, + level=args.log_level, + ) + + training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED + trainer = CustomTrainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=None, + #tokenizer=tokenizer, + ) + trainer.train() + trainer.save_model(args.save) + tokenizer.save_pretrained(args.save) + torch.distributed.destroy_process_group() diff --git a/requirements.txt b/requirements.txt index f39494e..a5594f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,18 @@ -git+https://github.com/t1101675/transformers.git@data_selection -git+https://github.com/EleutherAI/lm-evaluation-harness.git -torch -deepspeed==0.16.8 -numerize==0.12 -torchtyping==0.1.5 -rich==14.0.0 -accelerate==1.7.0 -datasets==3.6.0 -sentencepiece==0.2.1 -matplotlib==3.10.3 -wandb==0.19.11 -cvxpy==1.6.5 -h5py==3.13.0 -scikit-learn==1.6.1 -Levenshtein==0.27.1 
-xformers==0.0.26.post1 -# kenlm==0.3.0 +git+https://github.com/t1101675/transformers.git@data_selection +git+https://github.com/EleutherAI/lm-evaluation-harness.git +torch +deepspeed==0.16.8 +numerize==0.12 +torchtyping==0.1.5 +rich==14.0.0 +accelerate==1.7.0 +datasets==3.6.0 +sentencepiece==0.2.1 +matplotlib==3.10.3 +wandb==0.19.11 +cvxpy==1.6.5 +h5py==3.13.0 +scikit-learn==1.6.1 +Levenshtein==0.27.1 +xformers==0.0.26.post1 +# kenlm==0.3.0 diff --git a/utils.py b/utils.py index adb98f4..9428a80 100644 --- a/utils.py +++ b/utils.py @@ -1,443 +1,443 @@ -import os -import yaml -import json -import time -import random -import argparse -import numpy as np -from datetime import timedelta -from numerize.numerize import numerize - -import torch -import torch.distributed as dist -from torch.distributed import get_rank -import deepspeed -from accelerate import load_checkpoint_and_dispatch, init_empty_weights -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from datasets import Dataset, load_dataset - -try: - from transformers import mpu -except: - mpu = None - -WANDB_PROJ_NAME = "DELT" -PAD_EOS_MODELS = ["gpt2", "opt", "llama", "mistral"] -BOS_MODELS = ["fairseq", "mistral", "llama"] -#HF_TOKEN = os.environ["HF_TOKEN"] -HF_TOKEN = None - - -# Logging -def print_args(args): - print('arguments:', flush=True) - for arg in vars(args): - dots = '.' 
* (29 - len(arg)) - print(' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True) - - -def save_rank(log_str, save_path, rank=0): - if not dist.is_initialized() or dist.get_rank() == rank: - with open(save_path, "a") as f: - f.write(log_str + "\n") - - -def print_rank(*args, rank=0, **kwargs): - if not dist.is_initialized() or dist.get_rank() == rank: - print(*args, **kwargs) - - -# Distributed -def all_gather(t, dim=0, world_size=None, group=None, op="cat"): - if world_size is None: - world_size = dist.get_world_size() - all_t = [torch.zeros_like(t) for _ in range(world_size)] - dist.all_gather(all_t, t, group=group) - if op == "cat": - all_t = torch.cat(all_t, dim=dim) - elif op == "stack": - all_t = torch.stack(all_t, dim=dim) - return all_t - - -# Initialize -def set_random_seed(seed, mp=False): - """Set random seed for reproducability.""" - if dist.is_initialized(): - seed = dist.get_rank() + seed - if seed is not None and seed > 0: - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - # if mp: - # mpu.model_parallel_cuda_manual_seed(seed) - - -def add_args(args, method_params, fields=None): - if isinstance(fields, str): - fields = [fields] - if fields is None: - fields = method_params.keys() - - for field in fields: - if field in method_params: - value = method_params[field] - if isinstance(value, dict): - for key, sub_value in value.items(): - if not hasattr(args, key) or getattr(args, key) is None: - setattr(args, key, sub_value) - else: - if not hasattr(args, field) or getattr(args, field) is None: - setattr(args, field, value) - return args - - -def base_training_hp_suffix(args): - suffix = "" - suffix += (f"e{args.epochs}" if args.epochs is not None else f"t{numerize(args.total_iters)}") + \ - (f"-w{numerize(args.warmup_iters)}" if args.warmup_iters > 0 else "") + \ - (f"-bs{args.batch_size}-lr{args.lr}{args.scheduler_name}-G{args.gradient_accumulation_steps}") + \ - (f"-mp{args.model_parallel_size}" if args.model_parallel > 0 
else "") - return suffix - - -def base_infer_hp_suffix(args): - return "" - - -def base_model_suffix(args): - return f"{args.ckpt_name.replace('/', '_')}" - - -def base_data_suffix(args): - return f"{args.data_name.replace('/', '_')}" - - -def init_distributed(args): - args.rank = int(os.getenv("RANK", "0")) - args.world_size = int(os.getenv("WORLD_SIZE", "1")) - args.local_rank = int(os.getenv("LOCAL_RANK", "0")) - - if args.rank == 0: - print(f"using world size: {args.world_size}") - - # Manually set the device ids. - device = args.rank % torch.cuda.device_count() - if args.local_rank is not None: - device = args.local_rank - torch.cuda.set_device(device) - - dist.init_process_group(backend="nccl", timeout=timedelta(minutes=300)) - - -def init_distributed_ds(args): - args.rank = int(os.getenv("RANK", "0")) - args.world_size = int(os.getenv("WORLD_SIZE", "1")) - args.local_rank = int(os.getenv("LOCAL_RANK", "0")) - - if args.rank == 0: - print(f"using world size: {args.world_size}") - - # Manually set the device ids. 
- device = args.rank % torch.cuda.device_count() - if args.local_rank is not None: - device = args.local_rank - torch.cuda.set_device(device) - - deepspeed.init_distributed(timeout=timedelta(minutes=300)) - - -def init_deepspeed(args): - if args.deepspeed_config is not None: - with open(args.deepspeed_config, "r") as f: - ds_config = json.load(f) - - ds_config["gradient_accumulation_steps"] = args.gradient_accumulation_steps - ds_config["train_micro_batch_size_per_gpu"] = args.batch_size - ds_config["gradient_clipping"] = args.clip_grad - ds_config["steps_per_print"] = 10000000 - - if not args.do_train: - ds_config["zero_optimization"]["stage"] = 0 - - if not ds_config["fp16"]["enabled"]: - args.fp32 = True - - args.deepspeed_config = None - else: - ds_config = None - - return ds_config - - -def init_deepspeed_infer(args): - if args.deepspeed_config is not None: - with open(args.deepspeed_config, "r") as f: - ds_config = json.load(f) - - ds_config["zero_optimization"]["stage"] = 0 - - if not ds_config["fp16"]["enabled"]: - args.fp32 = True - - args.deepspeed_config = None - else: - ds_config = None - - return ds_config - - -def init_env(seed): - torch.set_num_threads(16) - torch.backends.cudnn.enabled = False - - print('Random Seed: ', seed) - torch.manual_seed(seed) - np.random.seed(seed) - random.seed(seed) - # if os.environ.get('DETERMINISTIC') is not None: - os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8" - torch.backends.cudnn.benchmark = False - torch.backends.cudnn.deterministic = True - torch.use_deterministic_algorithms(True) - - # be consistent with nanogpt settings - torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul - torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn - torch.cuda.manual_seed_all(seed) - print('Set Random Seed Successful: ', seed) - - -def init(args, do_distributed=True): - # init distributed - if do_distributed: - if args.deepspeed: - init_distributed_ds(args) - else: - init_distributed(args) - - if 
args.model_parallel: - assert dist.get_world_size() % args.model_parallel_size == 0 - mpu.initialize_model_parallel(args.model_parallel_size) - - set_random_seed(args.seed, args.model_parallel) - # init save folder - if args.save != None: - os.makedirs(args.save, exist_ok=True) - - cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - args.time_stamp = cur_time - - init_env(args.seed) - - -# Load and save model -def get_model(args, device, model_path=None, config=None, from_scratch=None, model_cls=None): - if model_path is None: - model_path = args.model_path - print_rank("Initializing model from {}".format(model_path), rank=0) - print_rank(f"Attention Implementation: {args.attn_impl}") - if config is None: - config = AutoConfig.from_pretrained(model_path, attn_implementation=args.attn_impl) - - if args.dropout_path_rate is not None: - config.drop_path_rate = args.dropout_path_rate - if args.xops_attn: - assert args.attn_impl == "eager" - print_rank("Xops Attention") - config.use_memory_efficient_attention = True - else: - config.use_memory_efficient_attention = False - - if args.model_parallel: - config.is_model_parallel = True - with init_empty_weights(): - model = parallel_model_map[args.model_type].half() - load_parallel(model, args.model_path) - - if mpu.get_data_parallel_rank() == 0: - print(' > number of parameters on model parallel rank {}: {}'.format( - mpu.get_model_parallel_rank(), - sum([p.nelement() for p in model.parameters()])), flush=True) - else: - config.is_model_parallel = False - from_scratch = from_scratch if from_scratch is not None else args.from_scratch - model_cls = model_cls if model_cls is not None else AutoModelForCausalLM - if from_scratch: - print('Pre-train mode: train from scratch ...') - model = model_cls.from_config(config, attn_implementation=args.attn_impl).to(device) - else: - print('Fine-tune mode: load pre-train model from: ', str(model_path)) - dtype = torch.float32 if args.fp32 else torch.float16 - model = 
model_cls.from_pretrained(model_path, config=config, device_map={"": device}, torch_dtype=dtype) - # model = AutoModelForCausalLM.from_pretrained(args.model_path, torch_dtype="auto", trust_remote_code=True) - - #if dist.get_rank() == 0: - # print(' > number of parameters: {}'.format( - # sum([p.nelement() for p in model.parameters()])), flush=True) - # model = DDP(model) - # NOTE: no need for DDP since deepspeed has done - if args.gradient_checkpointing: - model.gradient_checkpointing_enable() - - return model - - -def get_tokenizer(args, model_path=None, model_type=None): - if model_path is None: - model_path = args.model_path - - if model_type is None: - model_type = args.model_type - - if args.max_length is None or not args.truncation: - tokenizer = AutoTokenizer.from_pretrained(model_path) - else: - tokenizer = AutoTokenizer.from_pretrained( - model_path, - model_max_length=args.max_length, - truncation=args.truncation, - #padding_side=args.padding_side, - #use_fast=False, - ) - - if model_type in PAD_EOS_MODELS: - #tokenizer.pad_token = tokenizer.eos_token - tokenizer.pad_token_id = tokenizer.eos_token_id - - return tokenizer - - -def load_parallel(model, load_dir): - mp_rank = mpu.get_model_parallel_rank() - assert mpu.get_model_parallel_world_size() != 1 - checkpoint_name = os.path.join(load_dir, f"mp{mpu.get_model_parallel_world_size()}", f"pytorch_model_{mp_rank}.bin") - assert os.path.exists(checkpoint_name), f"{checkpoint_name} does not exist." 
- model = load_checkpoint_and_dispatch(model=model, checkpoint=checkpoint_name, device_map={"": torch.cuda.current_device()}, dtype=torch.float16) - dist.barrier() - print(f"Rank {get_rank()}: {checkpoint_name} loaded.") - - -def save_parallel(model, save_dir): - mp_rank = mpu.get_model_parallel_rank() - os.makedirs(os.path.join(save_dir, f"mp{mpu.get_model_parallel_world_size()}"), exist_ok=True) - checkpoint_name = os.path.join(save_dir, f"mp{mpu.get_model_parallel_world_size()}", f"pytorch_model_{mp_rank}.bin") - torch.save(model.state_dict(), checkpoint_name) - print(f"Rank {get_rank()}: {checkpoint_name} saved.") - - -def load_yaml(file_path): - with open(file_path, "r") as file: - data = yaml.safe_load(file) - return data - - -def write_yaml(file_path, data): - with open(file_path, "w") as file: - yaml.dump(data, file, default_flow_style=False) - - -def load_jsonl(file_path): - with open(file_path, "r", encoding="utf-8") as file: - data = [json.loads(line) for line in file] - return data - - -def write_jsonl(file_path, data): - with open(file_path, "w", encoding="utf-8") as file: - for entry in data: - json_line = json.dumps(entry) - file.write(json_line + "\n") - - -def download_repo(repo_id, save_dir, repo_type="model", revision=None, token=None, allow_patterns=None, ignore_patterns=None): - from huggingface_hub import snapshot_download - - snapshot_download( - repo_id=repo_id, - repo_type=repo_type, - revision=revision, - local_dir=save_dir, - local_dir_use_symlinks=False, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - token=HF_TOKEN, - ) - print(f"Repository '{repo_id}' ({repo_type}) downloaded to: {save_dir}") - - -def download_model(model_id, save_dir): - tokenizer = AutoTokenizer.from_pretrained( - model_id, - token=HF_TOKEN, - trust_remote_code=True, - ) - tokenizer.save_pretrained(save_dir) - - model = AutoModelForCausalLM.from_pretrained( - model_id, - token=HF_TOKEN, - trust_remote_code=True, - ) - 
model.save_pretrained(save_dir, safe_serialization=False) - print(f"Model '{model_id}' has been saved to '{save_dir}'.") - - -def download_data(dataset_id, name, save_dir, split_name=None, sample_size=-1): - dataset = load_dataset( - dataset_id, - name=name, - split=(split_name if split_name != "" else None), - streaming=False, - token=HF_TOKEN, - trust_remote_code=True, - ) - - sampled_data = [] - for i, example in enumerate(dataset): - if sample_size > 0 and i >= sample_size: - break - sampled_data.append(example) - - sampled_dataset = Dataset.from_list(sampled_data) - sampled_dataset.to_json(save_dir) - - print(f"Dataset '{dataset_id}' has been saved to '{save_dir}' with {len(sampled_data)} samples.") - - -def download_redpajama_samples(save_path): - os.makedirs(os.path.dirname(save_path), exist_ok=True) - base_url = "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample/resolve/main/" - names = ( - "cc_2019-30_sample.jsonl", - "cc_2020-05_sample.jsonl", - "cc_2021-04_sample.jsonl", - "cc_2022-05_sample.jsonl", - "cc_2023-06_sample.jsonl" - ) - for name in names: - os.system(f"wget -O - {base_url + name} >> {save_path}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download HF dataset or model.") - parser.add_argument("--content", type=str, required=True, choices=["dataset", "model"], default="dataset", help="The content to be downloaded.") - parser.add_argument("--id", type=str, required=True, help="Input dataset id or model id.") - parser.add_argument("--data-name", type=str, required=False, default=None, help="Split name of dataset.") - parser.add_argument("--save-dir", type=str, required=True, help="Output path of saved dataset or model.") - parser.add_argument("--split-name", type=str, required=False, default=None, help="Split name of dataset.") - parser.add_argument("--sample-size", type=int, required=False, default=-1, help="Sample size of dataset.") - - args = parser.parse_args() - - if args.content 
== "model": - download_model(args.id, args.save_dir) - - if args.content == "dataset": - if args.id == "togethercomputer/RedPajama-Data-1T-Sample": - download_redpajama_samples(args.save_dir) - else: - download_data(args.id, args.data_name, args.save_dir, args.split_name, args.sample_size) - - if args.content == "repo": - download_repo(args.id, args.save_dir) +import os +import yaml +import json +import time +import random +import argparse +import numpy as np +from datetime import timedelta +from numerize.numerize import numerize + +import torch +import torch.distributed as dist +from torch.distributed import get_rank +import deepspeed +from accelerate import load_checkpoint_and_dispatch, init_empty_weights +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from datasets import Dataset, load_dataset + +try: + from transformers import mpu +except: + mpu = None + +WANDB_PROJ_NAME = "DELT" +PAD_EOS_MODELS = ["gpt2", "opt", "llama", "mistral"] +BOS_MODELS = ["fairseq", "mistral", "llama"] +#HF_TOKEN = os.environ["HF_TOKEN"] +HF_TOKEN = None + + +# Logging +def print_args(args): + print('arguments:', flush=True) + for arg in vars(args): + dots = '.' 
* (29 - len(arg)) + print(' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True) + + +def save_rank(log_str, save_path, rank=0): + if not dist.is_initialized() or dist.get_rank() == rank: + with open(save_path, "a") as f: + f.write(log_str + "\n") + + +def print_rank(*args, rank=0, **kwargs): + if not dist.is_initialized() or dist.get_rank() == rank: + print(*args, **kwargs) + + +# Distributed +def all_gather(t, dim=0, world_size=None, group=None, op="cat"): + if world_size is None: + world_size = dist.get_world_size() + all_t = [torch.zeros_like(t) for _ in range(world_size)] + dist.all_gather(all_t, t, group=group) + if op == "cat": + all_t = torch.cat(all_t, dim=dim) + elif op == "stack": + all_t = torch.stack(all_t, dim=dim) + return all_t + + +# Initialize +def set_random_seed(seed, mp=False): + """Set random seed for reproducability.""" + if dist.is_initialized(): + seed = dist.get_rank() + seed + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + # if mp: + # mpu.model_parallel_cuda_manual_seed(seed) + + +def add_args(args, method_params, fields=None): + if isinstance(fields, str): + fields = [fields] + if fields is None: + fields = method_params.keys() + + for field in fields: + if field in method_params: + value = method_params[field] + if isinstance(value, dict): + for key, sub_value in value.items(): + if not hasattr(args, key) or getattr(args, key) is None: + setattr(args, key, sub_value) + else: + if not hasattr(args, field) or getattr(args, field) is None: + setattr(args, field, value) + return args + + +def base_training_hp_suffix(args): + suffix = "" + suffix += (f"e{args.epochs}" if args.epochs is not None else f"t{numerize(args.total_iters)}") + \ + (f"-w{numerize(args.warmup_iters)}" if args.warmup_iters > 0 else "") + \ + (f"-bs{args.batch_size}-lr{args.lr}{args.scheduler_name}-G{args.gradient_accumulation_steps}") + \ + (f"-mp{args.model_parallel_size}" if args.model_parallel > 0 
else "") + return suffix + + +def base_infer_hp_suffix(args): + return "" + + +def base_model_suffix(args): + return f"{args.ckpt_name.replace('/', '_')}" + + +def base_data_suffix(args): + return f"{args.data_name.replace('/', '_')}" + + +def init_distributed(args): + args.rank = int(os.getenv("RANK", "0")) + args.world_size = int(os.getenv("WORLD_SIZE", "1")) + args.local_rank = int(os.getenv("LOCAL_RANK", "0")) + + if args.rank == 0: + print(f"using world size: {args.world_size}") + + # Manually set the device ids. + device = args.rank % torch.cuda.device_count() + if args.local_rank is not None: + device = args.local_rank + torch.cuda.set_device(device) + + dist.init_process_group(backend="nccl", timeout=timedelta(minutes=300)) + + +def init_distributed_ds(args): + args.rank = int(os.getenv("RANK", "0")) + args.world_size = int(os.getenv("WORLD_SIZE", "1")) + args.local_rank = int(os.getenv("LOCAL_RANK", "0")) + + if args.rank == 0: + print(f"using world size: {args.world_size}") + + # Manually set the device ids. 
+ device = args.rank % torch.cuda.device_count() + if args.local_rank is not None: + device = args.local_rank + torch.cuda.set_device(device) + + deepspeed.init_distributed(timeout=timedelta(minutes=300)) + + +def init_deepspeed(args): + if args.deepspeed_config is not None: + with open(args.deepspeed_config, "r") as f: + ds_config = json.load(f) + + ds_config["gradient_accumulation_steps"] = args.gradient_accumulation_steps + ds_config["train_micro_batch_size_per_gpu"] = args.batch_size + ds_config["gradient_clipping"] = args.clip_grad + ds_config["steps_per_print"] = 10000000 + + if not args.do_train: + ds_config["zero_optimization"]["stage"] = 0 + + if not ds_config["fp16"]["enabled"]: + args.fp32 = True + + args.deepspeed_config = None + else: + ds_config = None + + return ds_config + + +def init_deepspeed_infer(args): + if args.deepspeed_config is not None: + with open(args.deepspeed_config, "r") as f: + ds_config = json.load(f) + + ds_config["zero_optimization"]["stage"] = 0 + + if not ds_config["fp16"]["enabled"]: + args.fp32 = True + + args.deepspeed_config = None + else: + ds_config = None + + return ds_config + + +def init_env(seed): + torch.set_num_threads(16) + torch.backends.cudnn.enabled = False + + print('Random Seed: ', seed) + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + # if os.environ.get('DETERMINISTIC') is not None: + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8" + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + torch.use_deterministic_algorithms(True) + + # be consistent with nanogpt settings + torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul + torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn + torch.cuda.manual_seed_all(seed) + print('Set Random Seed Successful: ', seed) + + +def init(args, do_distributed=True): + # init distributed + if do_distributed: + if args.deepspeed: + init_distributed_ds(args) + else: + init_distributed(args) + + if 
args.model_parallel: + assert dist.get_world_size() % args.model_parallel_size == 0 + mpu.initialize_model_parallel(args.model_parallel_size) + + set_random_seed(args.seed, args.model_parallel) + # init save folder + if args.save != None: + os.makedirs(args.save, exist_ok=True) + + cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + args.time_stamp = cur_time + + init_env(args.seed) + + +# Load and save model +def get_model(args, device, model_path=None, config=None, from_scratch=None, model_cls=None): + if model_path is None: + model_path = args.model_path + print_rank("Initializing model from {}".format(model_path), rank=0) + print_rank(f"Attention Implementation: {args.attn_impl}") + if config is None: + config = AutoConfig.from_pretrained(model_path, attn_implementation=args.attn_impl) + + if args.dropout_path_rate is not None: + config.drop_path_rate = args.dropout_path_rate + if args.xops_attn: + assert args.attn_impl == "eager" + print_rank("Xops Attention") + config.use_memory_efficient_attention = True + else: + config.use_memory_efficient_attention = False + + if args.model_parallel: + config.is_model_parallel = True + with init_empty_weights(): + model = parallel_model_map[args.model_type].half() + load_parallel(model, args.model_path) + + if mpu.get_data_parallel_rank() == 0: + print(' > number of parameters on model parallel rank {}: {}'.format( + mpu.get_model_parallel_rank(), + sum([p.nelement() for p in model.parameters()])), flush=True) + else: + config.is_model_parallel = False + from_scratch = from_scratch if from_scratch is not None else args.from_scratch + model_cls = model_cls if model_cls is not None else AutoModelForCausalLM + if from_scratch: + print('Pre-train mode: train from scratch ...') + model = model_cls.from_config(config, attn_implementation=args.attn_impl).to(device) + else: + print('Fine-tune mode: load pre-train model from: ', str(model_path)) + dtype = torch.float32 if args.fp32 else torch.float16 + model = 
model_cls.from_pretrained(model_path, config=config, device_map={"": device}, torch_dtype=dtype) + # model = AutoModelForCausalLM.from_pretrained(args.model_path, torch_dtype="auto", trust_remote_code=True) + + #if dist.get_rank() == 0: + # print(' > number of parameters: {}'.format( + # sum([p.nelement() for p in model.parameters()])), flush=True) + # model = DDP(model) + # NOTE: no need for DDP since deepspeed has done + if args.gradient_checkpointing: + model.gradient_checkpointing_enable() + + return model + + +def get_tokenizer(args, model_path=None, model_type=None): + if model_path is None: + model_path = args.model_path + + if model_type is None: + model_type = args.model_type + + if args.max_length is None or not args.truncation: + tokenizer = AutoTokenizer.from_pretrained(model_path) + else: + tokenizer = AutoTokenizer.from_pretrained( + model_path, + model_max_length=args.max_length, + truncation=args.truncation, + #padding_side=args.padding_side, + #use_fast=False, + ) + + if model_type in PAD_EOS_MODELS: + #tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + + return tokenizer + + +def load_parallel(model, load_dir): + mp_rank = mpu.get_model_parallel_rank() + assert mpu.get_model_parallel_world_size() != 1 + checkpoint_name = os.path.join(load_dir, f"mp{mpu.get_model_parallel_world_size()}", f"pytorch_model_{mp_rank}.bin") + assert os.path.exists(checkpoint_name), f"{checkpoint_name} does not exist." 
+ model = load_checkpoint_and_dispatch(model=model, checkpoint=checkpoint_name, device_map={"": torch.cuda.current_device()}, dtype=torch.float16) + dist.barrier() + print(f"Rank {get_rank()}: {checkpoint_name} loaded.") + + +def save_parallel(model, save_dir): + mp_rank = mpu.get_model_parallel_rank() + os.makedirs(os.path.join(save_dir, f"mp{mpu.get_model_parallel_world_size()}"), exist_ok=True) + checkpoint_name = os.path.join(save_dir, f"mp{mpu.get_model_parallel_world_size()}", f"pytorch_model_{mp_rank}.bin") + torch.save(model.state_dict(), checkpoint_name) + print(f"Rank {get_rank()}: {checkpoint_name} saved.") + + +def load_yaml(file_path): + with open(file_path, "r") as file: + data = yaml.safe_load(file) + return data + + +def write_yaml(file_path, data): + with open(file_path, "w") as file: + yaml.dump(data, file, default_flow_style=False) + + +def load_jsonl(file_path): + with open(file_path, "r", encoding="utf-8") as file: + data = [json.loads(line) for line in file] + return data + + +def write_jsonl(file_path, data): + with open(file_path, "w", encoding="utf-8") as file: + for entry in data: + json_line = json.dumps(entry) + file.write(json_line + "\n") + + +def download_repo(repo_id, save_dir, repo_type="model", revision=None, token=None, allow_patterns=None, ignore_patterns=None): + from huggingface_hub import snapshot_download + + snapshot_download( + repo_id=repo_id, + repo_type=repo_type, + revision=revision, + local_dir=save_dir, + local_dir_use_symlinks=False, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + token=HF_TOKEN, + ) + print(f"Repository '{repo_id}' ({repo_type}) downloaded to: {save_dir}") + + +def download_model(model_id, save_dir): + tokenizer = AutoTokenizer.from_pretrained( + model_id, + token=HF_TOKEN, + trust_remote_code=True, + ) + tokenizer.save_pretrained(save_dir) + + model = AutoModelForCausalLM.from_pretrained( + model_id, + token=HF_TOKEN, + trust_remote_code=True, + ) + 
model.save_pretrained(save_dir, safe_serialization=False) + print(f"Model '{model_id}' has been saved to '{save_dir}'.") + + +def download_data(dataset_id, name, save_dir, split_name=None, sample_size=-1): + dataset = load_dataset( + dataset_id, + name=name, + split=(split_name if split_name != "" else None), + streaming=False, + token=HF_TOKEN, + trust_remote_code=True, + ) + + sampled_data = [] + for i, example in enumerate(dataset): + if sample_size > 0 and i >= sample_size: + break + sampled_data.append(example) + + sampled_dataset = Dataset.from_list(sampled_data) + sampled_dataset.to_json(save_dir) + + print(f"Dataset '{dataset_id}' has been saved to '{save_dir}' with {len(sampled_data)} samples.") + + +def download_redpajama_samples(save_path): + os.makedirs(os.path.dirname(save_path), exist_ok=True) + base_url = "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample/resolve/main/" + names = ( + "cc_2019-30_sample.jsonl", + "cc_2020-05_sample.jsonl", + "cc_2021-04_sample.jsonl", + "cc_2022-05_sample.jsonl", + "cc_2023-06_sample.jsonl" + ) + for name in names: + os.system(f"wget -O - {base_url + name} >> {save_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Download HF dataset or model.") + parser.add_argument("--content", type=str, required=True, choices=["dataset", "model"], default="dataset", help="The content to be downloaded.") + parser.add_argument("--id", type=str, required=True, help="Input dataset id or model id.") + parser.add_argument("--data-name", type=str, required=False, default=None, help="Split name of dataset.") + parser.add_argument("--save-dir", type=str, required=True, help="Output path of saved dataset or model.") + parser.add_argument("--split-name", type=str, required=False, default=None, help="Split name of dataset.") + parser.add_argument("--sample-size", type=int, required=False, default=-1, help="Sample size of dataset.") + + args = parser.parse_args() + + if args.content 
== "model": + download_model(args.id, args.save_dir) + + if args.content == "dataset": + if args.id == "togethercomputer/RedPajama-Data-1T-Sample": + download_redpajama_samples(args.save_dir) + else: + download_data(args.id, args.data_name, args.save_dir, args.split_name, args.sample_size) + + if args.content == "repo": + download_repo(args.id, args.save_dir)