From 54d3560daf1dc50a390c722e5175aab62333d1ca Mon Sep 17 00:00:00 2001 From: Anmol Mekala <49127549+molereddy@users.noreply.github.com> Date: Sat, 1 Mar 2025 09:13:50 -0500 Subject: [PATCH 01/15] Fix hyperlinks in README (#2) * testing commit * Fixes * cleanup --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dce38e5a..a04d4866 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ We provide several variants for each of the components in the unlearning pipelin ## πŸ“Œ Table of Contents - πŸ“– [Overview](#-overview) -- πŸ—ƒοΈ [Available Components](#-available-components) +- πŸ—ƒοΈ [Available Components](#%EF%B8%8F-available-components) - ⚑ [Quickstart](#-quickstart) - πŸ› οΈ [Environment Setup](#-environment-setup) - πŸ’Ύ [Data Setup](#-data-setup) @@ -56,7 +56,7 @@ We provide several variants for each of the components in the unlearning pipelin - βž• [How to Add New Components](#-how-to-add-new-components) - πŸ“š [Further Documentation](#-further-documentation) - πŸ”— [Support & Contributors](#-support--contributors) -- πŸ“ [Citing this work](#-citating-this-work) +- πŸ“ [Citing this work](#-citing-this-work) - 🀝 [Acknowledgements](#-acknowledgements) - πŸ“„ [License](#-license) @@ -198,7 +198,7 @@ If you use OpenUnlearning in your research, please cite: --- -### 🀝 Acknowledgments +### 🀝 Acknowledgements - This repo is inspired from [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory). - The [TOFU](https://github.com/locuslab/tofu) and [MUSE](https://github.com/jaechan-repo/muse_bench) benchmarks served as the foundation for our re-implementation. From 4c36e4f5a39d979280efd20452be7ba5ff54e40a Mon Sep 17 00:00:00 2001 From: Dornavineeth Date: Sun, 2 Mar 2025 18:37:19 +0000 Subject: [PATCH 02/15] Fixed DPO command --- scripts/tofu_unlearn.sh | 56 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh index a556bd1d..eba38ff4 100644 --- a/scripts/tofu_unlearn.sh +++ b/scripts/tofu_unlearn.sh @@ -9,12 +9,6 @@ models=( "Llama-3.2-3B-Instruct" "Llama-3.1-8B-Instruct" ) -trainers_experiments=( - "GradAscent unlearn/tofu/default.yaml" - "GradDiff unlearn/tofu/default.yaml" - "NPO unlearn/tofu/default.yaml" - "DPO unlearn/tofu/default.yaml" -) forget_retain_splits=( "forget01 retain99" "forget05 retain95" @@ -29,7 +23,57 @@ gradient_accumulation_steps=4 ########################################### Unlearn TOFU models ######################################################## ######################################################################################################################## +trainers_experiments=( + "GradAscent unlearn/tofu/default.yaml" + "GradDiff unlearn/tofu/default.yaml" + "NPO unlearn/tofu/default.yaml" +) +for split in "${forget_retain_splits[@]}"; do + forget_split=$(echo $split | cut -d' ' -f1) + retain_split=$(echo $split | cut -d' ' -f2) + for model in "${models[@]}"; do + for trainer_experiment in "${trainers_experiments[@]}"; do + trainer=$(echo $trainer_experiment | cut -d' ' -f1) + experiment=$(echo $trainer_experiment | cut -d' ' -f2) + + task_name=tofu_${model}_${forget_split}_${trainer} + model_path=open-unlearning/tofu_${model}_full + echo ${task_name}: Unlearning ${model_path} using ${trainer} + + # Unlearn + CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \ + src/train.py --config-name=unlearn.yaml \ + 
experiment=${experiment} \ + trainer=${trainer} \ + task_name=${task_name} \ + model=${model} \ + forget_split=${forget_split} \ + retain_split=${retain_split} \ + model.model_args.pretrained_model_name_or_path=${model_path} \ + retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \ + trainer.args.per_device_train_batch_size=$per_device_train_batch_size \ + trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \ + trainer.args.ddp_find_unused_parameters=true \ + trainer.args.gradient_checkpointing=true + + # Eval + CUDA_VISIBLE_DEVICES=0 python src/eval.py \ + experiment=eval/tofu/default.yaml \ + forget_split=${forget_split} \ + model=${model} \ + task_name=${task_name} \ + model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ + paths.output_dir=saves/unlearn/${task_name}/evals \ + retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json + done + done +done + + +trainers_experiments=( + "DPO unlearn/tofu/idk.yaml" +) for split in "${forget_retain_splits[@]}"; do forget_split=$(echo $split | cut -d' ' -f1) retain_split=$(echo $split | cut -d' ' -f2) From f7a69dee40cfe918e32250ce7c1ba7564205c9b4 Mon Sep 17 00:00:00 2001 From: Dornavineeth Date: Sun, 2 Mar 2025 18:50:47 +0000 Subject: [PATCH 03/15] download idk --- setup_data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/setup_data.py b/setup_data.py index 48de0ad1..358779c3 100644 --- a/setup_data.py +++ b/setup_data.py @@ -1,8 +1,17 @@ from huggingface_hub import snapshot_download +# Setup retain model metrics snapshot_download( repo_id="open-unlearning/eval", allow_patterns="*.json", repo_type="dataset", local_dir="saves/eval", ) + +# Setup data +snapshot_download( + repo_id="open-unlearning/idk", + allow_patterns="*.jsonl", + repo_type="dataset", + local_dir="data", +) \ No newline at end of file From 332af36c4772eb1e836b767adbd102951693b60c Mon Sep 17 00:00:00 2001 From: Vineeth <48151992+Dornavineeth@users.noreply.github.com> Date: Sun, 2 Mar 2025 14:13:43 -0500 Subject: [PATCH 04/15] Revert "Dpo fix" --- scripts/tofu_unlearn.sh | 56 +++++------------------------------------ setup_data.py | 9 ------- 2 files changed, 6 insertions(+), 59 deletions(-) diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh index eba38ff4..a556bd1d 100644 --- a/scripts/tofu_unlearn.sh +++ b/scripts/tofu_unlearn.sh @@ -9,6 +9,12 @@ models=( "Llama-3.2-3B-Instruct" "Llama-3.1-8B-Instruct" ) +trainers_experiments=( + "GradAscent unlearn/tofu/default.yaml" + "GradDiff unlearn/tofu/default.yaml" + "NPO unlearn/tofu/default.yaml" + "DPO unlearn/tofu/default.yaml" +) forget_retain_splits=( "forget01 retain99" "forget05 retain95" @@ -23,57 +29,7 @@ gradient_accumulation_steps=4 ########################################### Unlearn TOFU models ######################################################## ######################################################################################################################## -trainers_experiments=( - "GradAscent unlearn/tofu/default.yaml" - "GradDiff unlearn/tofu/default.yaml" - "NPO unlearn/tofu/default.yaml" -) -for split in "${forget_retain_splits[@]}"; do - forget_split=$(echo $split | cut -d' ' -f1) - retain_split=$(echo $split | cut -d' ' -f2) - for model in "${models[@]}"; do - for trainer_experiment in "${trainers_experiments[@]}"; do - trainer=$(echo $trainer_experiment | cut -d' ' -f1) - experiment=$(echo $trainer_experiment | cut -d' ' -f2) - - task_name=tofu_${model}_${forget_split}_${trainer} - 
model_path=open-unlearning/tofu_${model}_full - echo ${task_name}: Unlearning ${model_path} using ${trainer} - - # Unlearn - CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \ - src/train.py --config-name=unlearn.yaml \ - experiment=${experiment} \ - trainer=${trainer} \ - task_name=${task_name} \ - model=${model} \ - forget_split=${forget_split} \ - retain_split=${retain_split} \ - model.model_args.pretrained_model_name_or_path=${model_path} \ - retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \ - trainer.args.per_device_train_batch_size=$per_device_train_batch_size \ - trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \ - trainer.args.ddp_find_unused_parameters=true \ - trainer.args.gradient_checkpointing=true - - # Eval - CUDA_VISIBLE_DEVICES=0 python src/eval.py \ - experiment=eval/tofu/default.yaml \ - forget_split=${forget_split} \ - model=${model} \ - task_name=${task_name} \ - model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ - paths.output_dir=saves/unlearn/${task_name}/evals \ - retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json - done - done -done - - -trainers_experiments=( - "DPO unlearn/tofu/idk.yaml" -) for split in "${forget_retain_splits[@]}"; do forget_split=$(echo $split | cut -d' ' -f1) retain_split=$(echo $split | cut -d' ' -f2) diff --git a/setup_data.py b/setup_data.py index 358779c3..48de0ad1 100644 --- a/setup_data.py +++ b/setup_data.py @@ -1,17 +1,8 @@ from huggingface_hub import snapshot_download -# Setup retain model metrics snapshot_download( repo_id="open-unlearning/eval", allow_patterns="*.json", repo_type="dataset", local_dir="saves/eval", ) - -# Setup data -snapshot_download( - repo_id="open-unlearning/idk", - allow_patterns="*.jsonl", - repo_type="dataset", - local_dir="data", -) \ No newline at end of file From f468efb9eaa0c737e6e8b4e64abb411131ff7a99 Mon Sep 17 00:00:00 2001 From: Dornavineeth Date: Sun, 2 Mar 2025 19:22:21 +0000 Subject: [PATCH 05/15] download idk data --- setup_data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/setup_data.py b/setup_data.py index 48de0ad1..358779c3 100644 --- a/setup_data.py +++ b/setup_data.py @@ -1,8 +1,17 @@ from huggingface_hub import snapshot_download +# Setup retain model metrics snapshot_download( repo_id="open-unlearning/eval", allow_patterns="*.json", repo_type="dataset", local_dir="saves/eval", ) + +# Setup data +snapshot_download( + repo_id="open-unlearning/idk", + allow_patterns="*.jsonl", + repo_type="dataset", + local_dir="data", +) \ No newline at end of file From ca8d5038b07ca8b8bbf0a71bf8a9a5502899f154 Mon Sep 17 00:00:00 2001 From: Dornavineeth Date: Sun, 2 Mar 2025 19:22:45 +0000 Subject: [PATCH 06/15] fix dpo experiment config --- scripts/tofu_unlearn.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh index a556bd1d..1794c9b6 100644 --- a/scripts/tofu_unlearn.sh +++ b/scripts/tofu_unlearn.sh @@ -13,7 +13,7 @@ trainers_experiments=( "GradAscent unlearn/tofu/default.yaml" "GradDiff unlearn/tofu/default.yaml" "NPO unlearn/tofu/default.yaml" - "DPO unlearn/tofu/default.yaml" + "DPO unlearn/tofu/idk.yaml" ) forget_retain_splits=( "forget01 retain99" From 8b073d66a7a08b4923eeffa6cbb485ffcc3246eb Mon Sep 17 00:00:00 2001 From: Vineeth <48151992+Dornavineeth@users.noreply.github.com> Date: Sun, 9 Mar 2025 15:32:43 -0400 Subject: [PATCH 07/15] RMU (#6) * 
IdkDPO script fix in tofu_unlearn.sh (#65) * Fix hyperlinks in README * Download I don't know data in setup_data.py * Fix tofu_unlearn.sh for IdkDPO --------- Co-authored-by: Anmol Mekala <49127549+molereddy@users.noreply.github.com> * overwrite=True * RMU added * Fix ref model device * ruff fix * RMU updated * Update rmu.py * Update README.md: add RMU * Added references and renamed functions --------- Co-authored-by: Anmol Mekala <49127549+molereddy@users.noreply.github.com> --- README.md | 4 +- configs/experiment/unlearn/muse/default.yaml | 1 + .../experiment/unlearn/muse/scalability.yaml | 1 + .../unlearn/muse/sustainabilty.yaml | 1 + configs/experiment/unlearn/tofu/default.yaml | 1 + configs/experiment/unlearn/tofu/idk.yaml | 1 + configs/trainer/RMU.yaml | 14 ++ docs/results.md | 37 ++++- scripts/tofu_unlearn.sh | 1 + src/trainer/__init__.py | 2 + src/trainer/unlearn/grad_diff.py | 2 +- src/trainer/unlearn/rmu.py | 142 ++++++++++++++++++ 12 files changed, 203 insertions(+), 4 deletions(-) create mode 100644 configs/trainer/RMU.yaml create mode 100644 src/trainer/unlearn/rmu.py diff --git a/README.md b/README.md index a04d4866..4c2cf8a3 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ ## πŸ“– Overview -We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 5 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 7+ LLMs. Each of these can be easily extended to incorporate more variants. +We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 7+ LLMs. Each of these can be easily extended to incorporate more variants. We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field. 
@@ -35,7 +35,7 @@ We provide several variants for each of the components in the unlearning pipelin | **Component** | **Available Options** | |------------------------|----------------------| | **Benchmarks** | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/) | -| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO | +| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU | | **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, QA-ROUGE, MIA Attacks, TruthRatio, Model Utility | | **Datasets** | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) | | **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2, ICLM; Additional: Phi-3.5, Phi-1.5, Gemma | diff --git a/configs/experiment/unlearn/muse/default.yaml b/configs/experiment/unlearn/muse/default.yaml index 454a84e3..b4bdbe0f 100644 --- a/configs/experiment/unlearn/muse/default.yaml +++ b/configs/experiment/unlearn/muse/default.yaml @@ -34,6 +34,7 @@ eval: muse: data_split: ${data_split} retain_logs_path: ${retain_logs_path} + overwrite: true trainer: args: diff --git a/configs/experiment/unlearn/muse/scalability.yaml b/configs/experiment/unlearn/muse/scalability.yaml index 11d90f50..b19e0cb5 100644 --- a/configs/experiment/unlearn/muse/scalability.yaml +++ b/configs/experiment/unlearn/muse/scalability.yaml @@ -34,6 +34,7 @@ eval: muse: data_split: ${data_split} retain_logs_path: ${retain_logs_path} + overwrite: true trainer: args: diff --git a/configs/experiment/unlearn/muse/sustainabilty.yaml b/configs/experiment/unlearn/muse/sustainabilty.yaml index e5d79687..9a0a03e3 100644 --- a/configs/experiment/unlearn/muse/sustainabilty.yaml +++ b/configs/experiment/unlearn/muse/sustainabilty.yaml @@ -34,6 +34,7 @@ eval: muse: data_split: ${data_split} retain_logs_path: ${retain_logs_path} + overwrite: true trainer: args: diff --git a/configs/experiment/unlearn/tofu/default.yaml b/configs/experiment/unlearn/tofu/default.yaml index 5f7c4757..f2e0ab1a 100644 --- a/configs/experiment/unlearn/tofu/default.yaml +++ b/configs/experiment/unlearn/tofu/default.yaml @@ -20,6 +20,7 @@ eval: tofu: forget_split: ${forget_split} retain_logs_path: ${retain_logs_path} + overwrite: true data: anchor: forget diff --git a/configs/experiment/unlearn/tofu/idk.yaml b/configs/experiment/unlearn/tofu/idk.yaml index 61a365d0..5fcb85df 100644 --- a/configs/experiment/unlearn/tofu/idk.yaml +++ b/configs/experiment/unlearn/tofu/idk.yaml @@ -20,6 +20,7 @@ eval: tofu: forget_split: ${forget_split} retain_logs_path: ${retain_logs_path} + overwrite: true data: anchor: forget diff --git a/configs/trainer/RMU.yaml b/configs/trainer/RMU.yaml new file mode 100644 index 00000000..7e1f9028 --- /dev/null +++ b/configs/trainer/RMU.yaml @@ -0,0 +1,14 @@ +defaults: + - GradDiff + +handler: RMU +method_args: + # The params here are more dependent on model and dataset. 
Tune them carefully to work + gamma: 1.0 + steering_coeff: 2 + retain_loss_type: EMBED_DIFF + alpha: 1 + module_regex: model\.layers\.7 + trainable_params_regex: + - .* # update all parameters (as done in https://github.com/tmlr-group/G-effect/blob/ef368eea3b2c6dba1e090b9ebb021ac9f047e0ae/dataloader.py#L271) + # - model\.layers\.(5|6|7)\.mlp\.down_proj\.weight # If you want to update only these weights (as done in https://github.com/centerforaisafety/wmdp/blob/bc5e1ba0367ea826caeeeaa50656336a1e87acfb/rmu/unlearn.py#L26) \ No newline at end of file diff --git a/docs/results.md b/docs/results.md index 3f7cc7c9..3af7cb66 100644 --- a/docs/results.md +++ b/docs/results.md @@ -23,7 +23,7 @@ For all the experiments below, we used the following setup | **Hyperparameters** | Learning Rate (lr) = 1e-5
<br> Ξ± = 1, Ξ³ = 1, Ξ² = 0.1 (where applicable) <br>
Number of Epochs = 10 <br>
Optimizer: [paged_adamw_32bit](https://huggingface.co/docs/bitsandbytes/main/en/reference/optim/adamw#bitsandbytes.optim.PagedAdamW) | __Note:__ -1. Results may vary even with the same effective hyperparameters when trained with modifications to the distributed training setup, including when training on a single GPU. For example: methods such as SimNPO, can be significantly improved with careful tuning. **Please use these numbers only for reproducibility purposes**. +1. Results may vary even with the same effective hyperparameters when trained with modifications to the distributed training setup, including when training on a single GPU. For example: methods such as SimNPO & RMU can be significantly improved with careful tuning. **Please use these numbers only for reproducibility purposes**. 2. NPO in MUSE: for NPO, the MUSE implementation is inconsistent with the [original paper](https://github.com/licong-lin/negative-preference-optimization) as discussed [here]( https://github.com/jaechan-repo/muse_bench/issues/2). This inconsistency is carried over into implementations like [SimNPO](https://github.com/OPTML-Group/Unlearn-Simple/issues/5). Here, we use the original NPO implementation with the same loss function expression across datasets. @@ -140,6 +140,18 @@ __Note:__ 0.6 3.17e-04 + + RMU + 0.4 + 0.62 + 0.64 + 9.59e-10 + 0.02 + 0.81 + 6.92e-21 + 0.03 + 0.81 + @@ -257,6 +269,18 @@ __Note:__ 0.54 1.07e-05 + + RMU + 0.16 + 0.55 + 0.70 + 4.87e-10 + 0.58 + 0.77 + 3.15e-15 + 0.59 + 0.76 + @@ -354,6 +378,17 @@ __Note:__ -54.26 0.54 + + RMU + 0.48 + 0.05 + 56.36 + 0.51 + 0.29 + 0.79 + -60.52 + 0.48 + \ No newline at end of file diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh index 1794c9b6..ae33189f 100644 --- a/scripts/tofu_unlearn.sh +++ b/scripts/tofu_unlearn.sh @@ -14,6 +14,7 @@ trainers_experiments=( "GradDiff unlearn/tofu/default.yaml" "NPO unlearn/tofu/default.yaml" "DPO unlearn/tofu/idk.yaml" + "RMU unlearn/tofu/default.yaml" ) forget_retain_splits=( "forget01 retain99" diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py index 1c769bf6..7e195fa9 100644 --- a/src/trainer/__init__.py +++ b/src/trainer/__init__.py @@ -9,6 +9,7 @@ from trainer.unlearn.npo import NPO from trainer.unlearn.dpo import DPO from trainer.unlearn.simnpo import SimNPO +from trainer.unlearn.rmu import RMU TRAINER_REGISTRY: Dict[str, Any] = {} @@ -79,3 +80,4 @@ def load_trainer( _register_trainer(NPO) _register_trainer(DPO) _register_trainer(SimNPO) +_register_trainer(RMU) diff --git a/src/trainer/unlearn/grad_diff.py b/src/trainer/unlearn/grad_diff.py index e11c7a71..bfecc19a 100644 --- a/src/trainer/unlearn/grad_diff.py +++ b/src/trainer/unlearn/grad_diff.py @@ -14,7 +14,7 @@ def __init__(self, gamma=1.0, alpha=1.0, retain_loss_type="NLL", *args, **kwargs self.ref_model = self._prepare_ref_model(self.model) def _prepare_ref_model(self, model): - ref_model = copy.deepcopy(model).to("cuda") + ref_model = copy.deepcopy(model).to(self.accelerator.device) ref_model.eval() if self.is_deepspeed_enabled: ref_model = self._prepare_deepspeed(ref_model) diff --git a/src/trainer/unlearn/rmu.py b/src/trainer/unlearn/rmu.py new file mode 100644 index 00000000..391bd6ad --- /dev/null +++ b/src/trainer/unlearn/rmu.py @@ -0,0 +1,142 @@ +"""Borrowed implementation from https://github.com/centerforaisafety/wmdp/blob/main/rmu/unlearn.py""" + +import re +import torch +import deepspeed +from trainer.unlearn.grad_diff import GradDiff + + +class RMU(GradDiff): + def __init__(self, + module_regex="model\.layers\.7", 
+ trainable_params_regex=["model\.layers\.(5|6|7)\.mlp\.down_proj\.weight"], + steering_coeff=20, + *args, **kwargs): + """ + RMU Trainer that fine-tunes only specific layers and parameters using regex-based filtering. + + Args: + module_path (str): Regex pattern to match module names. + trainable_param_paths (list of str): List of regex patterns for trainable parameters. + """ + super().__init__(*args, **kwargs) + + # Create reference model if not already set + if self.ref_model is None: + self.ref_model = self._prepare_ref_model(self.model) + + # Unfreeze only the selected parameters + self.trainable_params_regex = trainable_params_regex # Regex for selecting params + + # Get actual module references + self.module_regex = module_regex # Regex for selecting modules + self.model_module = self._get_matching_module(self.model, self.module_regex) + self.ref_module = self._get_matching_module(self.ref_model, self.module_regex) + self.steering_coeff = steering_coeff + self.control_vec = None + + + def create_optimizer(self): + self._freeze_all_params(self.model, False) + # This makes the optimizer to select only trainable params + self._set_trainable_params(self.model, self.trainable_params_regex, True) + super().create_optimizer() + self._freeze_all_params(self.model, True) + + + def _get_matching_module(self, model, module_regex): + """Returns a single module matching the given regex from a DeepSpeed/DDP-wrapped model.""" + # Handle DeepSpeed and DDP-wrapped models by accessing the underlying module + if isinstance(model, deepspeed.DeepSpeedEngine): + model = model.module # Extract the actual PyTorch model inside + + matched_modules = {name: module for name, module in model.named_modules() if re.fullmatch(module_regex, name)} + + if len(matched_modules) > 1: + raise ValueError(f"More than one module matched with {module_regex}: {list(matched_modules.keys())}") + elif not matched_modules: + raise ValueError(f"No module matched with {module_regex}") + + return next(iter(matched_modules.values())) # Return the single matched module + + def _freeze_all_params(self, model, requires_grad=True): + """Freeze all parameters in the model initially.""" + for param in model.parameters(): + param.requires_grad = requires_grad + + def _set_trainable_params(self, model, trainable_params_regex, requires_grad=True): + """Unfreeze specific parameters that match the regex patterns.""" + for name, param in model.named_parameters(): + if any(re.fullmatch(pattern, name) for pattern in trainable_params_regex): + param.requires_grad = requires_grad + # print(f"{name}:requires_grad\t{requires_grad}") + + def forward_with_cache(self, model, inputs, module, no_grad=True): + """Performs a forward pass while caching the output of a specified module.""" + cache = [] + def hook(module, input, output): + if isinstance(output, tuple): + cache.append(output[0]) + else: + cache.append(output) + return None + + hook_handle = module.register_forward_hook(hook) + with torch.set_grad_enabled(not(no_grad)): + outputs = model(**inputs) + hook_handle.remove() + return cache[0], outputs + + def get_control_vector(self, dim): + if self.control_vec is None: + random_vector = torch.rand(1,1, dim) + self.control_vec = random_vector / torch.norm(random_vector) * self.steering_coeff + return self.control_vec + + + def compute_activation_loss(self, activation1, activation2, mask): + squared_diff = torch.nn.functional.mse_loss(activation1, activation2, reduction="none") # Shape (b, s, d) + expanded_mask = 
mask.unsqueeze(-1).expand_as(squared_diff) # Shape: [b, s, d] + squared_diff_sum = (squared_diff * expanded_mask).mean(dim=2).sum(dim=(1)) # Shape: [b, 1] + num_tokens = mask.sum(dim=-1, keepdim=True) # Sum over seq_len, Shape: [b, 1] + return (squared_diff_sum / num_tokens).mean() + + def compute_retain_loss(self, model, retain_inputs): + retain_loss = 0.0 + + if self.retain_loss_type == "EMBED_DIFF": + model_retain_activations, _ = self.forward_with_cache(model, retain_inputs, module=self.model_module, no_grad=False) + ref_retain_activations, _ = self.forward_with_cache(self.ref_model, retain_inputs, module=self.ref_module, no_grad=True) + mask = (retain_inputs['labels'] != -100) # Shape: [b, s] + retain_loss = self.compute_activation_loss(model_retain_activations, ref_retain_activations.to(model_retain_activations.device), mask) + else: + retain_loss = super().compute_retain_loss(model, retain_inputs) + return retain_loss + + def compute_loss(self, model, inputs, return_outputs=False): + forget_inputs = inputs["forget"] + forget_inputs = { + "input_ids": forget_inputs["input_ids"], + "attention_mask": forget_inputs["attention_mask"], + "labels": forget_inputs["labels"], + } + + model_forget_activations, forget_outputs = self.forward_with_cache(model, forget_inputs, self.model_module, no_grad=False) + # If multiple datasets or concepts need unlearning, pass the control vector during processing; otherwise, default to a random vector during training. + control_vec = forget_inputs.get("control_vec", self.get_control_vector(model_forget_activations.shape[-1])) + control_vec = control_vec.to(dtype=model_forget_activations.dtype, device=model_forget_activations.device) + control_vec = control_vec.expand_as(model_forget_activations) + mask = (forget_inputs['labels'] != -100) # Shape: [b, s] + forget_loss = self.compute_activation_loss(model_forget_activations, control_vec, mask) + + retain_inputs = inputs["retain"] + retain_inputs = { + "input_ids": retain_inputs["input_ids"], + "attention_mask": retain_inputs["attention_mask"], + "labels": retain_inputs["labels"], + } + retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_inputs) + + loss = self.gamma * forget_loss + self.alpha * retain_loss + + return (loss, forget_outputs) if return_outputs else loss From dccb831590114c7714f540f19c4d1b9c60b26fea Mon Sep 17 00:00:00 2001 From: Vineeth <48151992+Dornavineeth@users.noreply.github.com> Date: Thu, 27 Mar 2025 16:12:33 -0400 Subject: [PATCH 08/15] Add structure to contributions, setup leaderboard, update documentation (#8) * docs: updates, small corrections, re-formats * modified ruff commands * modified ruff commands * CI/CD minor updates * added contributing + leaderboard * fix minor spelling misatkes * docs: bunch of minor updates * docs fixes --------- Co-authored-by: molereddy --- .github/ISSUE_TEMPLATE/feature-request.yaml | 9 + .github/PULL_REQUEST_TEMPLATE.md | 5 +- .github/workflows/tests.yml | 7 +- .pre-commit-config.yaml | 6 +- Makefile | 6 +- README.md | 50 +++-- community/benchmarks/template/README.md | 51 +++++ community/benchmarks/template/run.sh | 18 ++ community/leaderboard.md | 168 ++++++++++++++++ community/methods/template/README.md | 39 ++++ community/methods/template/run.sh | 13 ++ docs/components.md | 4 +- docs/contributing.md | 207 ++++++++++++++++++++ docs/experiments.md | 4 +- docs/{results.md => repro.md} | 2 + setup.py | 9 +- setup_data.py | 2 +- src/trainer/unlearn/rmu.py | 105 ++++++---- 18 files changed, 620 insertions(+), 85 deletions(-) create 
mode 100644 community/benchmarks/template/README.md create mode 100644 community/benchmarks/template/run.sh create mode 100644 community/leaderboard.md create mode 100644 community/methods/template/README.md create mode 100644 community/methods/template/run.sh create mode 100644 docs/contributing.md rename docs/{results.md => repro.md} (98%) diff --git a/.github/ISSUE_TEMPLATE/feature-request.yaml b/.github/ISSUE_TEMPLATE/feature-request.yaml index 22ec6712..e43e9138 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yaml +++ b/.github/ISSUE_TEMPLATE/feature-request.yaml @@ -32,3 +32,12 @@ body: label: Motivation description: | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. + + - type: textarea + id: implementation + validations: + required: false + attributes: + label: Implementation + description: | + Please describe your proposed solution in detail. Outline the implementation approach, including any key technical considerations. If there are challenges or blockers preventing implementation, specify them along with potential workarounds or dependencies. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index fc69076e..fbc9a07b 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -6,6 +6,5 @@ Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). -- [ ] Have you gone through the documentation of adding new [components](../docs/components.md)? -- [ ] Did you make sure to update the documentation with your changes? Here are the pointers to documentation - [documentation guidelines](../README.md#-further-documentation). \ No newline at end of file +- [ ] Have you gone through the contributions [guide](../docs/contributing.md)? +- [ ] Are your changes documented? Read documentation guidelines [here](../README.md#-further-documentation). 
\ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 42770425..f1b0afa6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -46,16 +46,11 @@ jobs: run: | python -m pip install --upgrade pip pip install ruff==0.6.6 - # python -m pip install git+https://github.com/huggingface/transformers.git - # python -m pip install ".[torch,dev]" - name: Check Quality run: make quality - # - name: Check Style - # run: make style - # - name: Test with pytest # run: | - # cd LLaMA-Factory + # cd # make test diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 97a73417..97b7cdf4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,6 @@ repos: rev: v0.6.9 hooks: - id: ruff - args: [--fix, scripts, src] - - id: ruff-format - args: [scripts, src] \ No newline at end of file + args: [check, --fix, scripts, src, setup.py, setup_data.py] + - id: ruff + args: [format, scripts, src, setup.py setup_data.py] \ No newline at end of file diff --git a/Makefile b/Makefile index 179c2d59..11bdbb12 100644 --- a/Makefile +++ b/Makefile @@ -3,10 +3,12 @@ check_dirs := scripts src #setup.py quality: - ruff check $(check_dirs) + ruff check $(check_dirs) setup.py setup_data.py + ruff format --check $(check_dirs) setup.py setup_data.py style: - ruff --format $(check_dirs) + ruff check $(check_dirs) setup.py setup_data.py --fix + ruff format $(check_dirs) setup.py setup_data.py test: CUDA_VISIBLE_DEVICES= pytest tests/ diff --git a/README.md b/README.md index 403754b8..986b874f 100644 --- a/README.md +++ b/README.md @@ -21,12 +21,17 @@ ## πŸ“– Overview -We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 7+ LLMs. Each of these can be easily extended to incorporate more variants. +We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 6+ LLM architectures. Each of these can be easily extended to incorporate more variants. We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field. -> ⚠️ **Notice (Updated: February 27, 2025)** -> This repository replaces the original TOFU codebase, which can be found at [`github.com/locuslab/tofu`](https://github.com/locuslab/tofu) and isn't maintained anymore. +### πŸ“’ Updates + +#### [Mar 27, 2025] +- **Easier contributions, leaderboard and reproducibility**: We've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can document additions better and also update a leaderboard with their results. See [this section](#-how-to-contribute) for details. + +#### [Feb 27, 2025] +⚠️ **Repository Update**: This repo replaces the original TOFU codebase at [`github.com/locuslab/tofu`](https://github.com/locuslab/tofu), which is no longer maintained. 
## πŸ—ƒοΈ Available Components @@ -38,22 +43,21 @@ We provide several variants for each of the components in the unlearning pipelin | **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU | | **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, QA-ROUGE, MIA Attacks, TruthRatio, Model Utility | | **Datasets** | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) | -| **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2, ICLM; Additional: Phi-3.5, Phi-1.5, Gemma | +| **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma | --- ## πŸ“Œ Table of Contents - πŸ“– [Overview](#-overview) +- πŸ“’ [Updates](#-updates) - πŸ—ƒοΈ [Available Components](#%EF%B8%8F-available-components) - ⚑ [Quickstart](#-quickstart) - - πŸ› οΈ [Environment Setup](#-environment-setup) - - πŸ’Ύ [Data Setup](#-data-setup) - πŸ”„ [Updated TOFU benchmark](#-updated-tofu-benchmark) - πŸ§ͺ [Running Experiments](#-running-experiments) - πŸš€ [Perform Unlearning](#-perform-unlearning) - πŸ“Š [Perform an Evaluation](#-perform-an-evaluation) - πŸ“œ [Running Baseline Experiments](#-running-baseline-experiments) -- βž• [How to Add New Components](#-how-to-add-new-components) +- βž• [How to Contribute](#-how-to-contribute) - πŸ“š [Further Documentation](#-further-documentation) - πŸ”— [Support & Contributors](#-support--contributors) - πŸ“ [Citing this work](#-citing-this-work) @@ -64,20 +68,16 @@ We provide several variants for each of the components in the unlearning pipelin ## ⚑ Quickstart -### πŸ› οΈ Environment Setup - ```bash +# environment setup conda create -n unlearning python=3.11 conda activate unlearning pip install . pip install --no-build-isolation flash-attn==2.6.3 -``` - -### πŸ’Ύ Data Setup -Download the log files containing metric results from the models used in the supported benchmarks (including the retain model logs used to compare the unlearned models against). -```bash -python setup_data.py # populates saves/eval with evaluation results of the uploaded models +# data setup +python setup_data.py # saves/eval now contains evaluation results of the uploaded models +# Downloads log files with metric eval results (incl retain model logs) from the models used in the supported benchmarks. ``` --- @@ -103,7 +103,7 @@ python src/train.py --config-name=unlearn.yaml experiment=unlearn/tofu/default \ forget_split=forget10 retain_split=retain90 trainer=GradAscent task_name=SAMPLE_UNLEARN ``` -- `experiment`- Path to the Hydra config file [`configs/experiment/unlearn/muse/default.yaml`](configs/experiment/unlearn/tofu/default.yaml) with default experimental settings for TOFU unlearning, e.g. train dataset, eval benchmark details, model paths etc.. +- `experiment`- Path to the Hydra config file [`configs/experiment/unlearn/tofu/default.yaml`](configs/experiment/unlearn/tofu/default.yaml) with default experimental settings for TOFU unlearning, e.g. train dataset, eval benchmark details, model paths etc.. - `forget_split/retain_split`- Sets the forget and retain dataset splits. - `trainer`- Load [`configs/trainer/GradAscent.yaml`](configs/trainer/GradAscent.yaml) and override the unlearning method with the handler (see config) implemented in [`src/trainer/unlearn/grad_ascent.py`](src/trainer/unlearn/grad_ascent.py). 
@@ -126,24 +126,21 @@ For more details about creating and running evaluations, refer [`docs/evaluation ### πŸ“œ Running Baseline Experiments -The scripts below execute standard baseline unlearning experiments on the TOFU and MUSE datasets, evaluated using their corresponding benchmarks. The expected results for these are in [`docs/results.md`](docs/results.md). +The scripts below execute standard baseline unlearning experiments on the TOFU and MUSE datasets, evaluated using their corresponding benchmarks. The expected results for these are in [`docs/repro.md`](docs/repro.md). ```bash bash scripts/tofu_unlearn.sh bash scripts/muse_unlearn.sh ``` ---- +The above scripts are not tuned and uses default hyper parameter settings. We encourage you to tune your methods and add your final results in [`community/leaderboard.md`](community/leaderboard.md). -## βž• How to Add New Components +--- -Adding a new component (trainer, evaluation metric, benchmark, model, or dataset) requires defining a new class, registering it, and creating a configuration file. Learn more about adding new components in [`docs/components.md`](docs/components.md). +## βž• How to Contribute -Please feel free to raise a pull request for any new features after setting up the environment in development mode. +If you are interested in contributing to our work, please have a look at [`contributing.md`](docs/contributing.md) guide. -```bash -pip install .[dev] -``` ## πŸ“š Further Documentation @@ -151,11 +148,12 @@ For more in-depth information on specific aspects of the framework, refer to the | **Documentation** | **Contains** | |------------------------------------------------|--------------------------------------------------------------------------------------------------------------------| -| [`docs/components.md`](docs/components.md) | Instructions on how to add new components such as trainers, benchmarks, metrics, models, datasets, etc. | +| [`docs/contributing.md`](docs/contributing.md) | Instructions on how to add new methods, benchmarks, components such as trainers, benchmarks, metrics, models, datasets, etc. | | [`docs/evaluation.md`](docs/evaluation.md) | Detailed instructions on creating and running evaluation metrics and benchmarks. | | [`docs/experiments.md`](docs/experiments.md) | Guide on running experiments in various configurations and settings, including distributed training, fine-tuning, and overriding arguments. | | [`docs/hydra.md`](docs/hydra.md) | Explanation of the Hydra features used in configuration management for experiments. | -| [`docs/results.md`](docs/results.md) | Reference results from various unlearning methods run using this framework on TOFU and MUSE benchmarks. | +| [`community/leaderboard.md`](community/leaderboard.md) | Reference results from various unlearning methods run using this framework on TOFU and MUSE benchmarks. | +| [`docs/repro.md`](docs/repro.md) (deprecated) | Results are provided solely for reproducibility purposes, without any parameter tuning. | --- ## πŸ”— Support & Contributors diff --git a/community/benchmarks/template/README.md b/community/benchmarks/template/README.md new file mode 100644 index 00000000..855952f5 --- /dev/null +++ b/community/benchmarks/template/README.md @@ -0,0 +1,51 @@ +# TITLE + +- Paper title, authors, links. + +Provide a concise summary of your benchmark details and its contributions. Please avoid using images to keep the repository size manageable. + +# Datasets + +Use a clear and consistent naming convention for dataset splits. 
+ +- [ ] Provide a link to find/download the datasets (preferably HuggingFace). + +# Models + + +- [ ] Upload any unlearning target or reference retain models for unlearning preferably on HuggingFace and provide the path. +- [ ] Model creation details and how they fit in benchmark. + +# Baselines & Results + +Discuss the baselines used and their results. + + +## Setup +Please include the experimental setup for the baselines + +- [ ] **Hyperparameters & Search Space:** Specify key hyperparameters, their search ranges, number of trials etc. +- [ ] **Computational Setup:** Mention the type and number of GPUs used. +- [ ] **DeepSpeed Configuration:** If any modifications were made to the default DeepSpeed config, specify them here. (You may include the config as a code block.) +- [ ] **Other Details:** Any additional setup details crucial for reproducing your method. + +To replicate your results, provide a `run.sh` script that contains all necessary commands to reproduce the final results. Ensure the script is well-documented. + + +# Citation + + +If you use this work, please cite: + +```bibtex + + + +@misc{openunlearning2025, + title={OpenUnlearning: A Unified Framework for LLM Unlearning Benchmarks}, + author={Dorna, Vineeth and Mekala, Anmol and Zhao, Wenlong and McCallum, Andrew and Kolter, J Zico and Maini, Pratyush}, + year={2025}, + howpublished={\url{https://github.com/locuslab/open-unlearning}}, + note={Accessed: February 27, 2025} +} +``` \ No newline at end of file diff --git a/community/benchmarks/template/run.sh b/community/benchmarks/template/run.sh new file mode 100644 index 00000000..a5335b28 --- /dev/null +++ b/community/benchmarks/template/run.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +######################################################################################################################## +########################################### RETAIN Finetuned ####$###################################################### +######################################################################################################################## + + + +######################################################################################################################### +############################################ FULL Finetuned models ###################################################### +######################################################################################################################### + + + + +######################################################################################################################### +############################################ Baseline methods ####$###################################################### +######################################################################################################################### diff --git a/community/leaderboard.md b/community/leaderboard.md new file mode 100644 index 00000000..88039321 --- /dev/null +++ b/community/leaderboard.md @@ -0,0 +1,168 @@ +
+# Leaderboard
+
+We encourage the community to develop new methods, optimize them for specific benchmarks, and compare results with existing approaches.
+
+To implement a new method, refer to our [contributing guide](../docs/contributing.md).
+
+> **Note:** The [repro.md](../docs/repro.md) file is maintained for reproducibility purposes. However, we encourage contributors to update the leaderboard table instead of the reproducibility table. We will continue refining and tuning baseline methods to keep the leaderboard up to date.
+
+
+### TOFU unlearning on the `Llama-3.2-1B-Instruct` architecture
+
+<table>
+  <tr>
+    <th rowspan="2">Method</th>
+    <th colspan="2">forget01</th>
+    <th colspan="2">forget05</th>
+    <th colspan="2">forget10</th>
+  </tr>
+  <tr>
+    <th>forget_quality</th>
+    <th>model_utility</th>
+    <th>forget_quality</th>
+    <th>model_utility</th>
+    <th>forget_quality</th>
+    <th>model_utility</th>
+  </tr>
+  <tr>
+    <td>Finetuned</td>
+    <td>0.01</td>
+    <td>0.60</td>
+    <td>2.96e-13</td>
+    <td>0.6</td>
+    <td>8.08e-22</td>
+    <td>0.6</td>
+  </tr>
+  <tr>
+    <td>Retain</td>
+    <td>1.0</td>
+    <td>0.60</td>
+    <td>1.0</td>
+    <td>0.6</td>
+    <td>1.0</td>
+    <td>0.59</td>
+  </tr>
+</table>
+
+
+### MUSE unlearning on the benchmark's target models
+
+<table>
+  <tr>
+    <th rowspan="2">Method</th>
+    <th colspan="4">News</th>
+    <th colspan="4">Books</th>
+  </tr>
+  <tr>
+    <th>forget_knowmem_ROUGE</th>
+    <th>forget_verbmem_ROUGE</th>
+    <th>privleak</th>
+    <th>retain_knowmem_ROUGE</th>
+    <th>forget_knowmem_ROUGE</th>
+    <th>forget_verbmem_ROUGE</th>
+    <th>privleak</th>
+    <th>retain_knowmem_ROUGE</th>
+  </tr>
+  <tr>
+    <td>Finetuned</td>
+    <td>0.64</td>
+    <td>0.58</td>
+    <td>-99.81</td>
+    <td>0.55</td>
+    <td>0.47</td>
+    <td>1.0</td>
+    <td>-57.26</td>
+    <td>0.69</td>
+  </tr>
+  <tr>
+    <td>Retain</td>
+    <td>0.33</td>
+    <td>0.21</td>
+    <td>0</td>
+    <td>0.56</td>
+    <td>0.3</td>
+    <td>0.14</td>
+    <td>0</td>
+    <td>0.69</td>
+  </tr>
+</table>
diff --git a/community/methods/template/README.md b/community/methods/template/README.md new file mode 100644 index 00000000..7facb01b --- /dev/null +++ b/community/methods/template/README.md @@ -0,0 +1,39 @@ +# TITLE + +- Paper title, authors, links. + + +Provide a concise summary of your method details and its contributions. Please avoid using images to keep the repository size manageable. + +# Setup + +Please include the experimental setup such as + +- [ ] **Hyperparameters & Search Space:** Specify key hyperparameters, their search ranges, number of trials etc. +- [ ] **Computational Setup:** Mention the type and number of GPUs used. +- [ ] **DeepSpeed Configuration:** If any modifications were made to the default DeepSpeed config, specify them here. (You may include the config as a code block.) +- [ ] **Other Details:** Any additional setup details crucial for reproducing your method. + +# Results + +To replicate your results, provide a `run.sh` script that contains all necessary commands to reproduce the final results. Ensure the script is well-documented. + +It would be appreciated if you can upload the final unlearned model(s) along with their `evals` folders to HuggingFace and provide the link(s) here. As the evaluations are updated, this would help us re-evaluate your model(s). + +# Citation + + +If you use this work, please cite: + +```bibtex + + + +@misc{openunlearning2025, + title={OpenUnlearning: A Unified Framework for LLM Unlearning Benchmarks}, + author={Dorna, Vineeth and Mekala, Anmol and Zhao, Wenlong and McCallum, Andrew and Kolter, J Zico and Maini, Pratyush}, + year={2025}, + howpublished={\url{https://github.com/locuslab/open-unlearning}}, + note={Accessed: February 27, 2025} +} +``` \ No newline at end of file diff --git a/community/methods/template/run.sh b/community/methods/template/run.sh new file mode 100644 index 00000000..8c165e51 --- /dev/null +++ b/community/methods/template/run.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +######################################################################################################################## +########################################### Hyper parameter tuning ##################################################### +######################################################################################################################## + +# Optional + +######################################################################################################################## +########################################### Final best parameters ##################################################### +######################################################################################################################## + +# Required to replicate your results \ No newline at end of file diff --git a/docs/components.md b/docs/components.md index e75f8dfe..016932c9 100644 --- a/docs/components.md +++ b/docs/components.md @@ -19,7 +19,7 @@ This process involves three main steps: 6. [Collator](#collator) - Handles data collation logic 7. [Experiment](#experiment) - Combines components into a final experiment config -__Note:__ adding each component requires Hydra config management features, which are documented in [`docs/hydra.md`](../docs/hydra.md). +> [!Note] adding each component requires Hydra config management features, which are documented in [`docs/hydra.md`](../docs/hydra.md). 
--- @@ -147,7 +147,7 @@ To add a new model architecture: ### Implement and register a handler For all the models currently supported, HuggingFace's `AutoModelForCausalLM` and `AutoTokenizer` are used, and therefore the user doesn't need to create or register any handler. -__Note__: Currently, we do not support loading models modified with LoRA and related variants. If you wish use such features, please create define and register model handlers for this logic in [`src/model`](../src/model) and provide the config info as discussed next. +> [!Note]: Currently, we do not support loading models modified with LoRA and related variants. If you wish use such features, please create define and register model handlers for this logic in [`src/model`](../src/model) and provide the config info as discussed next. ### Add to configs Model configurations contain details required to load the model+tokenizer such as paths, chat templating arguments, LoRA parameters etc. in [`configs/models`](../configs/models/). diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 00000000..3e398f59 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,207 @@ +# Contributing + +Everyone is welcome to contribute, and every contribution is valued. Aside from coding components, answering questions, assisting others, and improving documentation are all appreciated. + +You can also help by spreading the word! If you find this project useful, please share it with others, cite it, link it on your repositories and posts, or simply ⭐️ the repo to show your support. + +> 🀝 This guide is heavily borrowed from awesome [transformers](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md) guide to contributing. + +## Ways to Contribute + +There are several ways you can contribute to OpenUnlearning: + +* Fix issues with the existing code. +* Submit issues related to bugs or desired new features. +* Support new components (models, datasets, collator etc). +* Implement new unlearning methods. +* Implement new evaluations. +* Contribute to the documentation. + +## Fixing Issues + +If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](#create-a-pull-request) and open a Pull Request! + +## Submitting a Bug-Related Issue or Feature Request + +Do your best to follow these guidelines when submitting a bug-related issue or a feature request. It will make it easier for us to come back to you quickly and with good feedback. + +### Did You Find a Bug? + +Before you report an issue, we would really appreciate it if you could **make sure the bug was not already reported** (use the search bar on GitHub under Issues). Please try to ensure that the bug is in OpenUnlearning itself, and not your code. + +Please include the following information in your issue so we can quickly resolve it: + +* A short, self-contained, code snippet that allows us to reproduce the bug. +* The **full** traceback if an exception is raised. +* The hardware used to run the experiment, including specifications such as the number and type of GPUs etc. +* The hydra config file corresponding to the experiment if needed (since these files ae long you may link them or use a markdown dropdown in your issue). +* Attach any other additional information, like screenshots, you think may help. + +### Do You Want a New Feature? + +If there is a new feature you'd like to see in OpenUnlearning, please open an issue and describe: + +1. What is the *motivation* behind this feature? 
Is it related to a problem or frustration with the library? Is it something you worked on and think it could benefit the community?
+
+   Whatever it is, we'd love to hear about it!
+
+2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better we'll be able to help you.
+3. Provide a *code snippet* that demonstrates the feature's usage.
+4. If the feature is related to a paper, please include a link.
+
+## Do You Want to Support New Components?
+
+Adding a new component listed below requires defining a new class, registering it, and creating a configuration file. Learn more about adding new components in [`docs/components.md`](components.md).
+
+1. [Trainer](components.md#trainer) - Algorithm used in LLM training or unlearning
+2. [Dataset](components.md#dataset) - Dataset class for preprocessing raw data
+3. [Evaluation Metric](components.md#evaluation-metric) - Metric class implementing model evaluation
+4. [Benchmark](components.md#benchmark) - Suite combining multiple evaluation metrics
+5. [Model](components.md#model) - LLM used in unlearning
+6. [Collator](components.md#collator) - Handles data collation logic
+7. [Experiment](components.md#experiment) - Combines components into a final experiment config
+
+> **IMPORTANT** πŸš€
+> **We especially encourage** contributions of methods and benchmarks that you've created, since you best understand them and know how to use them. We are ready to expedite their integration into OpenUnlearning.
+> When facing difficulties implementing any component, please contact the maintainers to join our Discord, where we can discuss the implementation in detail.
+
+## Contributing a New Unlearning Method
+
+### 1. Implement an Unlearning Trainer
+
+Your method might require a custom loss function or other trainer-related modifications, which go here.
+Refer to our [Trainer implementation guide](components.md#trainer) to ensure your method integrates well with our framework.
+
+### 2. Detail Commands to Be Run
+
+Some methods might involve multiple commands or steps while unlearning: ensure you write a clear `.sh` file that documents this.
+
+### 3. Run and Tune Your Method on Relevant Benchmarks
+
+- Once implemented, evaluate your method on applicable benchmarks using the best possible parameters.
+- Create a folder for your method under [`community/methods/`](../community/methods) and include a README file in it, explaining the method details, hyper-parameters, and the strategy/logic for selecting the best unlearned model.
+- Include a bash script `run.sh` with the exact bash command needed to replicate your results.
+
+### 4. Update Leaderboard and Upload Model
+
+Don't forget to add your results to the [leaderboard](../community/leaderboard.md) and upload your unlearned model to HuggingFace for broader accessibility and reproducibility.
+
+```bash
+pip install huggingface_hub
+huggingface-cli login
+
+# The <...> values below are placeholders; substitute your own paths and username
+huggingface-cli repo create {benchmark}-{model}-{datasplit}-{method}
+cd <path-to-your-unlearned-model>
+
+git init
+git remote add origin https://huggingface.co/<username>/{benchmark}-{model}-{datasplit}-{method}
+git add .
+git commit -m "Initial commit"
+git push origin main
+```
+
+---
+
+## Contributing to Unlearning Benchmark Evaluations
+
+Evaluating LLM unlearning is essential for assessing the effectiveness of different unlearning methods. While various benchmarks and metrics exist, identifying the most suitable ones for capturing the nuances of unlearning remains an open challenge.
+
+Your contributions toward defining or improving evaluation methods can significantly advance unlearning research. By proposing reliable benchmarks, you help ensure that unlearning methods are both effective and aligned with real-world requirements.
+
+- To add a new unlearning evaluation metric, refer to our [Metric Implementation Guide](components.md#evaluation-metric).
+- To integrate new datasets and models, follow our [Components Guide](components.md).
+
+### Steps to add a new Unlearning Benchmark
+
+1. **Prepare Datasets & Models** – Create your dataset and train models to generate fine-tuned or retained models.
+2. **Define a New Benchmark** (if needed) – Follow the [Benchmark Guide](components.md#benchmark) to implement a new evaluation benchmark.
+3. **Run and Tune Baseline Methods** – Evaluate existing unlearning methods on your benchmark and optimize them.
+4. **Document & Share Findings** – Provide detailed steps for reproduction in [`community/benchmarks/`](../community/benchmarks).
+
+---
+
+## Do You Want to Add Documentation?
+
+We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know how the documentation can be improved, such as typos and any content that is missing, unclear, or inaccurate. We'll be happy to make the changes or help you make a contribution!
+
+---
+
+## Create a Pull Request
+
+Before writing any code, we strongly advise you to search through the existing PRs or issues to make sure nobody is already working on the same thing. If you are unsure, it is always a good idea to open an issue to get some feedback.
+
+Follow the steps below to start contributing:
+
+1. Fork the [repository](https://github.com/locuslab/open-unlearning) by clicking on the **[Fork](https://github.com/locuslab/open-unlearning/fork)** button on the repository's page. This creates a copy of the code under your GitHub user account.
+
+2. Clone your fork to your local disk, and add the base repository as a remote:
+
+   ```bash
+   # replace <your-username> with your GitHub username
+   git clone git@github.com:<your-username>/open-unlearning.git
+   cd open-unlearning
+   git remote add upstream https://github.com/locuslab/open-unlearning.git
+   ```
+
+3. You can work on the forked main branch or create a new branch to hold your development changes:
+
+   ```bash
+   git checkout -b a-descriptive-name-for-my-changes
+   ```
+
+4. Set up the environment in dev mode after following the steps in [Quick Start](../README.md#-quickstart). This installs other packages such as `ruff`, `pre-commit`, etc.
+
+   ```bash
+   pip install .[dev]
+   ```
+
+5. Develop the features in your fork/branch.
+
+   As you work on your code, you should make sure the code is linted and formatted correctly.
+
+   OpenUnlearning relies on `ruff` to lint & format its source code consistently. After you make changes, check the code quality by running
+
+   ```bash
+   make quality
+   ```
+
+   If you prefer to apply the style corrections:
+
+   ```bash
+   make style
+   ```
+
+   Once you're happy with your changes, add the changed files with `git add` and record your changes locally with `git commit`:
+
+   ```bash
+   git add modified_file.py
+   git commit
+   ```
+
+   Please remember to write [good commit messages](https://chris.beams.io/posts/git-commit/) to clearly communicate the changes you made!
+
+   To keep your copy of the code up to date with the original repository, rebase your branch on `upstream/main` *before* you open a pull request or if requested by a maintainer:
+
+   ```bash
+   git fetch upstream
+   git rebase upstream/main
+   ```
+
+   Push your changes to your branch:
+
+   ```bash
+   git push -u origin a-descriptive-name-for-my-changes
+   ```
+
+   If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally.
+
+6. Now you can go to your fork of the repository on GitHub and click on **Pull Request** to open a pull request. Make sure you tick off all the boxes on our [checklist](#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.
+
+7. Please bear with the maintainers regarding any changes we request! We want to keep the repository clean and easily extensible. As you make your updates, you may want to keep working in your local branch and push the changes to your fork; changes pushed to the fork automatically appear in the pull request, where everyone can follow them.
+
+### Pull Request Checklist
+
+☐ The pull request title should summarize your contribution.
+☐ If your pull request addresses an issue, please mention the issue number in the pull request description so that they are linked (and people viewing the issue know you are working on it).
+☐ To indicate a work in progress, please prefix the title with `[WIP]`. This is useful to avoid duplicated work and to differentiate it from PRs ready to be merged.
+☐ Make sure existing tests and checks, if any, pass.
+☐ Make sure methods have informative docstrings.
\ No newline at end of file
diff --git a/docs/experiments.md b/docs/experiments.md
index 86109614..b570d3e8 100644
--- a/docs/experiments.md
+++ b/docs/experiments.md
@@ -59,7 +59,7 @@ paths.output_dir=saves/unlearn/NPO/evals
 ```
 
-**Note:** The unlearning experiments support evaluation during the unlearning training. But this is supported only on a single GPU, evaluation can be performed during unlearning itself. When multiple GPUs are used to train, checkpoints must be stored and evaluated after training.
+> [!Note]: The unlearning experiments support evaluation during unlearning finetuning, but only on a single GPU. When multiple GPUs are used to train, checkpoints must be stored and evaluated after training.
 
 ---
 
@@ -242,7 +242,7 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
 src/train.py --config-name=unlearn.yaml experiment=unlearn/muse/default.yaml task_name=DISTRIBUTED_TRAIN
 ```
 
-**Note:** Evaluation runs are designed to work only a single GPU (this includes running evaluation during training). To run an evaluation job, modify your command to make only one GPU visible (assuming one GPU is enough for inference):
+> [!Note]: Evaluation runs are designed to work on only a single GPU (this includes running evaluation during training). To run an evaluation job, modify your command to make only one GPU visible (assuming one GPU is enough for inference):
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/muse/default.yaml task_name=SAMPLE_EVAL
diff --git a/docs/results.md b/docs/repro.md
similarity index 98%
rename from docs/results.md
rename to docs/repro.md
index 3af7cb66..ac64ac38 100644
--- a/docs/results.md
+++ b/docs/repro.md
@@ -4,6 +4,8 @@
 
 
 
+> For results where methods have been tuned for optimal performance, please refer to the [`community/leaderboard`](../community/leaderboard.md).
+
 The scripts below execute standard baseline unlearning experiments on the TOFU and MUSE datasets, evaluated using their corresponding benchmarks.
 ```bash
 bash scripts/tofu_unlearn.sh
diff --git a/setup.py b/setup.py
index 79c6dbc9..209335c1 100644
--- a/setup.py
+++ b/setup.py
@@ -7,8 +7,8 @@
 setup(
     name="open-unlearning",
     version="0.1.0",
-    author="Vineeth Dorna, Anmol Reddy Mekala",
-    author_email="vineethdornal@gmail.com, m.anmolreddy@gmail.com",
+    author="Vineeth Dorna, Anmol Mekala",
+    author_email="vineethdorna@gmail.com, m.anmolreddy@gmail.com",
     description="A library for machine unlearning in LLMs.",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
@@ -16,7 +16,10 @@
     packages=find_packages(),
     install_requires=requirements,  # Uses requirements.txt
     extras_require={
-        "dev": ["pre-commit==4.0.1"],  # Install using `pip install .[dev]`
+        "dev": [
+            "pre-commit==4.0.1",
+            "ruff==0.6.9",
+        ],  # Install using `pip install .[dev]`
     },
     python_requires=">=3.11",
 )
diff --git a/setup_data.py b/setup_data.py
index 358779c3..760679b7 100644
--- a/setup_data.py
+++ b/setup_data.py
@@ -14,4 +14,4 @@
     allow_patterns="*.jsonl",
     repo_type="dataset",
     local_dir="data",
-)
\ No newline at end of file
+)
diff --git a/src/trainer/unlearn/rmu.py b/src/trainer/unlearn/rmu.py
index 391bd6ad..d990d3a3 100644
--- a/src/trainer/unlearn/rmu.py
+++ b/src/trainer/unlearn/rmu.py
@@ -7,11 +7,14 @@
 
 
 class RMU(GradDiff):
-    def __init__(self,
-        module_regex="model\.layers\.7",
-        trainable_params_regex=["model\.layers\.(5|6|7)\.mlp\.down_proj\.weight"],
-        steering_coeff=20,
-        *args, **kwargs):
+    def __init__(
+        self,
+        module_regex="model\.layers\.7",
+        trainable_params_regex=["model\.layers\.(5|6|7)\.mlp\.down_proj\.weight"],
+        steering_coeff=20,
+        *args,
+        **kwargs,
+    ):
         """
         RMU Trainer that fine-tunes only specific layers and parameters using regex-based filtering.
 
@@ -20,40 +23,46 @@ def __init__(self,
             trainable_param_paths (list of str): List of regex patterns for trainable parameters.
""" super().__init__(*args, **kwargs) - + # Create reference model if not already set if self.ref_model is None: self.ref_model = self._prepare_ref_model(self.model) # Unfreeze only the selected parameters - self.trainable_params_regex = trainable_params_regex # Regex for selecting params - + self.trainable_params_regex = ( + trainable_params_regex # Regex for selecting params + ) + # Get actual module references self.module_regex = module_regex # Regex for selecting modules self.model_module = self._get_matching_module(self.model, self.module_regex) self.ref_module = self._get_matching_module(self.ref_model, self.module_regex) self.steering_coeff = steering_coeff self.control_vec = None - - + def create_optimizer(self): self._freeze_all_params(self.model, False) # This makes the optimizer to select only trainable params self._set_trainable_params(self.model, self.trainable_params_regex, True) super().create_optimizer() self._freeze_all_params(self.model, True) - - + def _get_matching_module(self, model, module_regex): """Returns a single module matching the given regex from a DeepSpeed/DDP-wrapped model.""" # Handle DeepSpeed and DDP-wrapped models by accessing the underlying module if isinstance(model, deepspeed.DeepSpeedEngine): model = model.module # Extract the actual PyTorch model inside - matched_modules = {name: module for name, module in model.named_modules() if re.fullmatch(module_regex, name)} + matched_modules = { + name: module + for name, module in model.named_modules() + if re.fullmatch(module_regex, name) + } if len(matched_modules) > 1: - raise ValueError(f"More than one module matched with {module_regex}: {list(matched_modules.keys())}") + raise ValueError( + f"More than one module matched with {module_regex}: {list(matched_modules.keys())}" + ) elif not matched_modules: raise ValueError(f"No module matched with {module_regex}") @@ -70,45 +79,59 @@ def _set_trainable_params(self, model, trainable_params_regex, requires_grad=Tru if any(re.fullmatch(pattern, name) for pattern in trainable_params_regex): param.requires_grad = requires_grad # print(f"{name}:requires_grad\t{requires_grad}") - + def forward_with_cache(self, model, inputs, module, no_grad=True): """Performs a forward pass while caching the output of a specified module.""" cache = [] + def hook(module, input, output): if isinstance(output, tuple): cache.append(output[0]) else: cache.append(output) - return None - + return None + hook_handle = module.register_forward_hook(hook) - with torch.set_grad_enabled(not(no_grad)): + with torch.set_grad_enabled(not (no_grad)): outputs = model(**inputs) hook_handle.remove() return cache[0], outputs - + def get_control_vector(self, dim): if self.control_vec is None: - random_vector = torch.rand(1,1, dim) - self.control_vec = random_vector / torch.norm(random_vector) * self.steering_coeff + random_vector = torch.rand(1, 1, dim) + self.control_vec = ( + random_vector / torch.norm(random_vector) * self.steering_coeff + ) return self.control_vec - def compute_activation_loss(self, activation1, activation2, mask): - squared_diff = torch.nn.functional.mse_loss(activation1, activation2, reduction="none") # Shape (b, s, d) + squared_diff = torch.nn.functional.mse_loss( + activation1, activation2, reduction="none" + ) # Shape (b, s, d) expanded_mask = mask.unsqueeze(-1).expand_as(squared_diff) # Shape: [b, s, d] - squared_diff_sum = (squared_diff * expanded_mask).mean(dim=2).sum(dim=(1)) # Shape: [b, 1] + squared_diff_sum = ( + (squared_diff * expanded_mask).mean(dim=2).sum(dim=(1)) + 
) # Shape: [b, 1] num_tokens = mask.sum(dim=-1, keepdim=True) # Sum over seq_len, Shape: [b, 1] return (squared_diff_sum / num_tokens).mean() - + def compute_retain_loss(self, model, retain_inputs): retain_loss = 0.0 - + if self.retain_loss_type == "EMBED_DIFF": - model_retain_activations, _ = self.forward_with_cache(model, retain_inputs, module=self.model_module, no_grad=False) - ref_retain_activations, _ = self.forward_with_cache(self.ref_model, retain_inputs, module=self.ref_module, no_grad=True) - mask = (retain_inputs['labels'] != -100) # Shape: [b, s] - retain_loss = self.compute_activation_loss(model_retain_activations, ref_retain_activations.to(model_retain_activations.device), mask) + model_retain_activations, _ = self.forward_with_cache( + model, retain_inputs, module=self.model_module, no_grad=False + ) + ref_retain_activations, _ = self.forward_with_cache( + self.ref_model, retain_inputs, module=self.ref_module, no_grad=True + ) + mask = retain_inputs["labels"] != -100 # Shape: [b, s] + retain_loss = self.compute_activation_loss( + model_retain_activations, + ref_retain_activations.to(model_retain_activations.device), + mask, + ) else: retain_loss = super().compute_retain_loss(model, retain_inputs) return retain_loss @@ -121,14 +144,22 @@ def compute_loss(self, model, inputs, return_outputs=False): "labels": forget_inputs["labels"], } - model_forget_activations, forget_outputs = self.forward_with_cache(model, forget_inputs, self.model_module, no_grad=False) + model_forget_activations, forget_outputs = self.forward_with_cache( + model, forget_inputs, self.model_module, no_grad=False + ) # If multiple datasets or concepts need unlearning, pass the control vector during processing; otherwise, default to a random vector during training. - control_vec = forget_inputs.get("control_vec", self.get_control_vector(model_forget_activations.shape[-1])) - control_vec = control_vec.to(dtype=model_forget_activations.dtype, device=model_forget_activations.device) + control_vec = forget_inputs.get( + "control_vec", self.get_control_vector(model_forget_activations.shape[-1]) + ) + control_vec = control_vec.to( + dtype=model_forget_activations.dtype, device=model_forget_activations.device + ) control_vec = control_vec.expand_as(model_forget_activations) - mask = (forget_inputs['labels'] != -100) # Shape: [b, s] - forget_loss = self.compute_activation_loss(model_forget_activations, control_vec, mask) - + mask = forget_inputs["labels"] != -100 # Shape: [b, s] + forget_loss = self.compute_activation_loss( + model_forget_activations, control_vec, mask + ) + retain_inputs = inputs["retain"] retain_inputs = { "input_ids": retain_inputs["input_ids"], @@ -136,7 +167,7 @@ def compute_loss(self, model, inputs, return_outputs=False): "labels": retain_inputs["labels"], } retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_inputs) - + loss = self.gamma * forget_loss + self.alpha * retain_loss return (loss, forget_outputs) if return_outputs else loss From 5a7dfb452e73ffe52f7e249040bb433612152884 Mon Sep 17 00:00:00 2001 From: dong-river Date: Wed, 2 Apr 2025 22:13:41 +0100 Subject: [PATCH 09/15] UNDIAL --- community/methods/UNDIAL/README.md | 24 +++++++ community/methods/UNDIAL/run.sh | 103 +++++++++++++++++++++++++++++ configs/trainer/UNDIAL.yaml | 12 ++++ src/trainer/unlearn/undial.py | 24 +++++++ 4 files changed, 163 insertions(+) create mode 100644 community/methods/UNDIAL/README.md create mode 100644 community/methods/UNDIAL/run.sh create mode 100644 configs/trainer/UNDIAL.yaml create 
mode 100644 src/trainer/unlearn/undial.py

diff --git a/community/methods/UNDIAL/README.md b/community/methods/UNDIAL/README.md
new file mode 100644
index 00000000..de906dd5
--- /dev/null
+++ b/community/methods/UNDIAL/README.md
@@ -0,0 +1,24 @@
+# UNDIAL: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models (NAACL 2025)
+
+- Authors: Yijiang River Dong, Hongzhou Lin, Mikhail Belkin, Ramón Huerta, Ivan Vulić
+- Link: https://arxiv.org/pdf/2402.10052
+
+# Setup
+- Hyperparameters: The original paper uses Llama-2 7B with LoRA to tune the model (rank=8, alpha=16) and a learning rate of 1e-4. It's suggested to search the learning rate over [1e-5, 3e-4, 1e-4], epochs over [3,10], and use an effective batch size of 16 (batch_size * gradient_accumulation). The other important hyperparameter is beta, the strength of the penalty, which typically takes a value in [3,10,30]. If you change to other models, adjust the learning rate accordingly.
+
+- Computation Setup: All experiments are run on one A100.
+- Other Details: The original paper does not use the retain set and aims to retain knowledge in all domains, not just on the retain set. So alpha is set to 0. Practitioners could search over the gamma to better retain the performance on the retain set.
+
+# Results
+Run the `run.sh` script. The results can be found in `leaderboard.md`.
+
+# Citation
+@misc{dong2024undial,
+      title={UNDIAL: Self-Distillation with Adjusted Logits for Robust Unlearning in Large Language Models},
+      author={Yijiang River Dong and Hongzhou Lin and Mikhail Belkin and Ramon Huerta and Ivan Vulić},
+      year={2024},
+      eprint={2402.10052},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2402.10052},
+}
\ No newline at end of file
diff --git a/community/methods/UNDIAL/run.sh b/community/methods/UNDIAL/run.sh
new file mode 100644
index 00000000..a0b6815a
--- /dev/null
+++ b/community/methods/UNDIAL/run.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+
+
+models=(
+    "Llama-3.2-1B-Instruct"
+)
+trainers_experiments=(
+    "UNDIAL unlearn/tofu/default.yaml"
+)
+forget_retain_splits=(
+    "forget05 retain95"
+    "forget10 retain90"
+    "forget01 retain99"
+)
+
+per_device_train_batch_size=4 # on two gpus would make effective batch size 32
+gradient_accumulation_steps=4
+
+
+########################################################################################################################
+########################################### Unlearn TOFU models ########################################################
+########################################################################################################################
+
+
+for split in "${forget_retain_splits[@]}"; do
+    forget_split=$(echo $split | cut -d' ' -f1)
+    retain_split=$(echo $split | cut -d' ' -f2)
+    for model in "${models[@]}"; do
+        for trainer_experiment in "${trainers_experiments[@]}"; do
+            trainer=$(echo $trainer_experiment | cut -d' ' -f1)
+            experiment=$(echo $trainer_experiment | cut -d' ' -f2)
+
+            task_name=tofu_${model}_${forget_split}_${trainer}
+            model_path=open-unlearning/tofu_${model}_full
+            echo ${task_name}: Unlearning ${model_path} using ${trainer}
+
+            # Unlearn
+            python src/train.py --config-name=unlearn.yaml \
+            experiment=${experiment} \
+            trainer=${trainer} \
+            task_name=${task_name} \
+            model=${model} \
+            forget_split=${forget_split} \
+            retain_split=${retain_split} \
+            model.model_args.pretrained_model_name_or_path=${model_path} \
+            retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \
+            trainer.args.per_device_train_batch_size=$per_device_train_batch_size \
+            trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \
+
+            # Eval
+            CUDA_VISIBLE_DEVICES=0 python src/eval.py \
+            experiment=eval/tofu/default.yaml \
+            forget_split=${forget_split} \
+            model=${model} \
+            task_name=${task_name} \
+            model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
+            paths.output_dir=saves/unlearn/${task_name}/evals \
+            retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json
+        done
+    done
+done
+
+# #########################################################
+# #################### MUSE Unlearning ####################
+# #########################################################
+
+
+model=Llama-3.2-1B-Instruct
+
+data_splits=(
+    "News"
+    "Books"
+)
+
+trainers=(
+    "UNDIAL"
+)
+
+for data_split in "${data_splits[@]}"; do
+    for trainer in "${trainers[@]}"; do
+
+        task_name=muse_${model}_${data_split}_${trainer}
+
+        python src/train.py --config-name=unlearn.yaml \
+        experiment=unlearn/muse/default.yaml \
+        model=${model} \
+        data_split=${data_split} \
+        trainer=${trainer} \
+        task_name=${task_name} \
+        retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json \
+        trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \
+        trainer.args.gradient_accumulation_steps=${gradient_accumulation_steps} \
+
+        CUDA_VISIBLE_DEVICES=0 python src/eval.py \
+        experiment=eval/muse/default.yaml \
+        data_split=${data_split} \
+        task_name=${task_name} \
+        model=${model} \
+        model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \
+        paths.output_dir=saves/unlearn/${trainer}/evals \
+        retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json
+    done
+done
diff --git a/configs/trainer/UNDIAL.yaml b/configs/trainer/UNDIAL.yaml
new file mode 100644
index 00000000..c7884121
--- /dev/null
+++ b/configs/trainer/UNDIAL.yaml
@@ -0,0 +1,12 @@
+handler: UNDIAL # corresponds to the class defined in src/trainer/unlearn/undial.py
+args: # HuggingFace TrainingArguments
+  per_device_train_batch_size: 2
+  per_device_eval_batch_size: 16
+  gradient_accumulation_steps: 8
+  learning_rate: 1e-4
+  num_train_epochs: 10
+method_args: # Your own method-specific arguments
+  gamma: 1.0
+  alpha: 0.0
+  beta: 10.0 # the strength of the penalty on memorized tokens
+  retain_loss_type: NLL
\ No newline at end of file
diff --git a/src/trainer/unlearn/undial.py b/src/trainer/unlearn/undial.py
new file mode 100644
index 00000000..a82101ff
--- /dev/null
+++ b/src/trainer/unlearn/undial.py
@@ -0,0 +1,24 @@
+from trainer.utils import compute_undial_loss
+from trainer.unlearn.grad_diff import GradDiff
+
+class UNDIAL(GradDiff):
+    def __init__(self, beta=1.0, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.beta = beta
+        if self.ref_model is None:
+            self.ref_model = self._prepare_ref_model(self.model)
+
+    def compute_loss(self, model, inputs, return_outputs=False):
+        forget_inputs = inputs["forget"]
+        forget_loss, forget_outputs = compute_undial_loss(model, self.ref_model, forget_inputs, self.beta)
+
+        retain_inputs = inputs["retain"]
+        retain_inputs = {
+            "input_ids": retain_inputs["input_ids"],
+            "attention_mask": retain_inputs["attention_mask"],
+            "labels": retain_inputs["labels"],
+        }
+        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_inputs)
+
+        loss = self.gamma * forget_loss + self.alpha * retain_loss
+        return (loss, forget_outputs) if return_outputs else loss
\ No newline at end of file
From c4c800035b4f0a0a79fc23b29898e4ab674c8249 Mon Sep 17 00:00:00 2001
From: dong-river
Date: Wed, 2 Apr 2025 22:20:38 +0100
Subject: [PATCH 10/15] UNDIAL2

---
 src/trainer/__init__.py |  3 +++
 src/trainer/utils.py    | 31 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py
index 7e195fa9..b69ab604 100644
--- a/src/trainer/__init__.py
+++ b/src/trainer/__init__.py
@@ -10,6 +10,7 @@
 from trainer.unlearn.dpo import DPO
 from trainer.unlearn.simnpo import SimNPO
 from trainer.unlearn.rmu import RMU
+from trainer.unlearn.undial import UNDIAL
 
 TRAINER_REGISTRY: Dict[str, Any] = {}
 
@@ -31,6 +32,7 @@ def load_trainer_args(trainer_args: DictConfig, dataset):
         // (batch_size * grad_accum_steps * num_devices)
     )
 
+    trainer_args["output_dir"] = trainer_args.get("output_dir", "./output")
     trainer_args = TrainingArguments(**trainer_args)
     return trainer_args
 
@@ -81,3 +83,4 @@ def load_trainer(
 _register_trainer(DPO)
 _register_trainer(SimNPO)
 _register_trainer(RMU)
+_register_trainer(UNDIAL)
\ No newline at end of file
diff --git a/src/trainer/utils.py b/src/trainer/utils.py
index c5125b78..7212d599 100644
--- a/src/trainer/utils.py
+++ b/src/trainer/utils.py
@@ -55,3 +55,34 @@ def compute_dpo_loss(model, ref_model, win_inputs=None, lose_inputs=None, beta=1
 
     loss = -2 / beta * F.logsigmoid(beta * (win_log_ratio - lose_log_ratio)).mean()
     return loss, (win_outputs, lose_outputs)
+
+def compute_undial_loss(model, ref_model, inputs, beta):
+    # Forward pass on the student (trainable) model
+    outputs = model(**inputs)
+    logits = outputs.logits
+    labels = inputs["labels"]
+
+    shift_labels = labels[..., 1:].contiguous()
+    shift_logits = logits[..., :-1, :].contiguous()
+
+    # Forward pass on the teacher model (no grad)
+    with torch.no_grad():
+        teacher_logits = ref_model(**inputs).logits
+        shift_teacher_logits = teacher_logits[..., :-1, :].contiguous()
+
+    # Build the mask that identifies the tokens that need to be unlearned
+    mask = torch.zeros_like(shift_teacher_logits)
+    batch_idx = torch.arange(mask.shape[0]).view(-1, 1, 1)
+    seq_idx = torch.arange(mask.shape[1]).view(1, -1, 1)
+    mask[batch_idx, seq_idx, shift_labels.unsqueeze(-1)] = 1.0
+
+    # Adjust teacher logits: subtract the penalty beta on the correct token
+    pre_softmax = shift_teacher_logits - mask * beta
+    soft_label = F.softmax(pre_softmax, dim=-1)
+
+    loss_fct = nn.CrossEntropyLoss(reduction='none')
+    loss = loss_fct(
+        shift_logits.view(-1, shift_logits.size(-1)),
+        soft_label.view(-1, soft_label.size(-1)),
+    )
+    return loss.mean(), outputs
\ No newline at end of file
From 4aec929fa07887eaff0de1b5971b7707ff1b95e3 Mon Sep 17 00:00:00 2001
From: dong-river
Date: Wed, 2 Apr 2025 22:21:57 +0100
Subject: [PATCH 11/15] UNDIAL3

---
 src/trainer/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py
index b69ab604..1b3f5aa5 100644
--- a/src/trainer/__init__.py
+++ b/src/trainer/__init__.py
@@ -32,7 +32,6 @@ def load_trainer_args(trainer_args: DictConfig, dataset):
         // (batch_size * grad_accum_steps * num_devices)
     )
 
-    trainer_args["output_dir"] = trainer_args.get("output_dir", "./output")
     trainer_args = TrainingArguments(**trainer_args)
     return trainer_args
 
From a08a26bb9b6cef35b3fdbb4bd7275517c7993c23 Mon Sep 17 00:00:00 2001
From: molereddy
Date: Sun, 13 Apr 2025 16:41:00 +0000
Subject: [PATCH 12/15] Ruff quality formatting changes

---
 src/trainer/__init__.py       |  2 +-
 src/trainer/unlearn/undial.py |  7 +++++--
 src/trainer/utils.py          | 13 +++++++------
 3 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py
index 45c1e4e6..379aa85a 100644
--- a/src/trainer/__init__.py
+++ b/src/trainer/__init__.py
@@ -89,4 +89,4 @@ def load_trainer(
 _register_trainer(DPO)
 _register_trainer(SimNPO)
 _register_trainer(RMU)
-_register_trainer(UNDIAL)
\ No newline at end of file
+_register_trainer(UNDIAL)
diff --git a/src/trainer/unlearn/undial.py b/src/trainer/unlearn/undial.py
index a82101ff..e32147b3 100644
--- a/src/trainer/unlearn/undial.py
+++ b/src/trainer/unlearn/undial.py
@@ -1,6 +1,7 @@
 from trainer.utils import compute_undial_loss
 from trainer.unlearn.grad_diff import GradDiff
 
+
 class UNDIAL(GradDiff):
     def __init__(self, beta=1.0, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -10,7 +11,9 @@ def __init__(self, beta=1.0, *args, **kwargs):
 
     def compute_loss(self, model, inputs, return_outputs=False):
         forget_inputs = inputs["forget"]
-        forget_loss, forget_outputs = compute_undial_loss(model, self.ref_model, forget_inputs, self.beta)
+        forget_loss, forget_outputs = compute_undial_loss(
+            model, self.ref_model, forget_inputs, self.beta
+        )
 
         retain_inputs = inputs["retain"]
         retain_inputs = {
@@ -21,4 +24,4 @@ def compute_loss(self, model, inputs, return_outputs=False):
         retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_inputs)
 
         loss = self.gamma * forget_loss + self.alpha * retain_loss
-        return (loss, forget_outputs) if return_outputs else loss
\ No newline at end of file
+        return (loss, forget_outputs) if return_outputs else loss
diff --git a/src/trainer/utils.py b/src/trainer/utils.py
index b496fb87..e1e8e865 100644
--- a/src/trainer/utils.py
+++ b/src/trainer/utils.py
@@ -67,33 +67,34 @@ def compute_dpo_loss(model, ref_model, win_inputs=None, lose_inputs=None, beta=1
     loss = -2 / beta * F.logsigmoid(beta * (win_log_ratio - lose_log_ratio)).mean()
     return loss, (win_outputs, lose_outputs)
 
+
 def compute_undial_loss(model, ref_model, inputs, beta):
     # Forward pass on the student (trainable) model
     outputs = model(**inputs)
     logits = outputs.logits
     labels = inputs["labels"]
-    
+
     shift_labels = labels[..., 1:].contiguous()
     shift_logits = logits[..., :-1, :].contiguous()
-    
+
     # Forward pass on the teacher model (no grad)
     with torch.no_grad():
         teacher_logits = ref_model(**inputs).logits
         shift_teacher_logits = teacher_logits[..., :-1, :].contiguous()
-    
+
     # Build the mask that identifies the tokens that need to be unlearned
     mask = torch.zeros_like(shift_teacher_logits)
     batch_idx = torch.arange(mask.shape[0]).view(-1, 1, 1)
     seq_idx = torch.arange(mask.shape[1]).view(1, -1, 1)
     mask[batch_idx, seq_idx, shift_labels.unsqueeze(-1)] = 1.0
-    
+
     # Adjust teacher logits: subtract the penalty beta on the correct token
     pre_softmax = shift_teacher_logits - mask * beta
     soft_label = F.softmax(pre_softmax, dim=-1)
 
-    loss_fct = nn.CrossEntropyLoss(reduction='none')
+    loss_fct = nn.CrossEntropyLoss(reduction="none")
     loss = loss_fct(
         shift_logits.view(-1, shift_logits.size(-1)),
         soft_label.view(-1, soft_label.size(-1)),
     )
-    return loss.mean(), outputs
\ No newline at end of file
+    return loss.mean(), outputs
From 2d7c66cc8c6a7bff5e2e77f26e73d55006b77d4e Mon Sep 17 00:00:00 2001
From: Dornavineeth
Date: Sun, 11 May 2025 18:44:30 +0000
Subject: [PATCH 13/15] fix config

---
 configs/trainer/UNDIAL.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/configs/trainer/UNDIAL.yaml b/configs/trainer/UNDIAL.yaml
index c7884121..5b2b0d30 100644
--- a/configs/trainer/UNDIAL.yaml
+++ b/configs/trainer/UNDIAL.yaml
@@ -1,8 +1,8 @@
+defaults:
+  - finetune
+
 handler: UNDIAL # corresponds to the class defined in src/trainer/unlearn/undial.py
 args: # HuggingFace TrainingArguments
-  per_device_train_batch_size: 2
-  per_device_eval_batch_size: 16
-  gradient_accumulation_steps: 8
   learning_rate: 1e-4
   num_train_epochs: 10
 method_args: # Your own method-specific arguments
From 716d7e9d020b18b23129436fa4c0baeb60869d72 Mon Sep 17 00:00:00 2001
From: Dornavineeth
Date: Thu, 22 May 2025 20:23:22 +0000
Subject: [PATCH 14/15] fix docs and script

---
 community/methods/UNDIAL/README.md |   6 +-
 community/methods/UNDIAL/run.sh    | 125 ++++++++++++-----------------
 2 files changed, 53 insertions(+), 78 deletions(-)

diff --git a/community/methods/UNDIAL/README.md b/community/methods/UNDIAL/README.md
index de906dd5..28c309d2 100644
--- a/community/methods/UNDIAL/README.md
+++ b/community/methods/UNDIAL/README.md
@@ -4,13 +4,13 @@
 - Link: https://arxiv.org/pdf/2402.10052
 
 # Setup
-- Hyperparameters: The original paper uses Llama-2 7B with LoRA to tune the model (rank=8, alpha=16) and a learning rate of 1e-4. It's suggested to search the learning rate over [1e-5, 3e-4, 1e-4], epochs over [3,10], and use an effective batch size of 16 (batch_size * gradient_accumulation). The other important hyperparameter is beta, the strength of the penalty, which typically takes a value in [3,10,30]. If you change to other models, adjust the learning rate accordingly.
+- Hyperparameters: The original paper uses Llama-2 7B with LoRA to tune the model (rank=8, alpha=16) and a learning rate of 1e-4. It's suggested to search the learning rate over [1e-5, 3e-4, 1e-4], and use an effective batch size of 32 (batch_size * gradient_accumulation). The other important hyperparameter is beta, the strength of the penalty, which typically takes a value in [3,10,30]. If you change to other models, adjust the learning rate accordingly.
 
 - Computation Setup: All experiments are run on one A100.
-- Other Details: The original paper does not use the retain set and aims to retain knowledge in all domains, not just on the retain set. So alpha is set to 0. Practitioners could search over the gamma to better retain the performance on the retain set.
+- Other Details: The original paper does not use the retain set and aims to retain knowledge in all domains, not just on the retain set. So alpha is set to 0. Practitioners could search over the alpha or gamma to better retain the performance on the retain set.
 
 # Results
-Run the `run.sh` script. The results can be found in `leaderboard.md`.
+Run the `run.sh` script.
 
# Citation @misc{dong2024undial, diff --git a/community/methods/UNDIAL/run.sh b/community/methods/UNDIAL/run.sh index a0b6815a..06caef2b 100644 --- a/community/methods/UNDIAL/run.sh +++ b/community/methods/UNDIAL/run.sh @@ -1,5 +1,11 @@ #!/bin/bash +export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()") +echo "Master Port: $MASTER_PORT" + +######################################################################################################################## +########################################### Unlearn TOFU models ######################################################## +######################################################################################################################## models=( "Llama-3.2-1B-Instruct" @@ -8,18 +14,18 @@ trainers_experiments=( "UNDIAL unlearn/tofu/default.yaml" ) forget_retain_splits=( - "forget05 retain95" "forget10 retain90" + "forget05 retain95" "forget01 retain99" ) -per_device_train_batch_size=4 # on two gpus would make effective batch size 32 -gradient_accumulation_steps=4 +per_device_train_batch_size=16 +gradient_accumulation_steps=2 -######################################################################################################################## -########################################### Unlearn TOFU models ######################################################## -######################################################################################################################## +lrs=(1e-5 1e-4 3e-4) +alphas=(1 2 5) +betas=(3 10 30) for split in "${forget_retain_splits[@]}"; do @@ -29,75 +35,44 @@ for split in "${forget_retain_splits[@]}"; do for trainer_experiment in "${trainers_experiments[@]}"; do trainer=$(echo $trainer_experiment | cut -d' ' -f1) experiment=$(echo $trainer_experiment | cut -d' ' -f2) - - task_name=tofu_${model}_${forget_split}_${trainer} - model_path=open-unlearning/tofu_${model}_full - echo ${task_name}: Unlearning ${model_path} using ${trainer} - - # Unlearn - python src/train.py --config-name=unlearn.yaml \ - experiment=${experiment} \ - trainer=${trainer} \ - task_name=${task_name} \ - model=${model} \ - forget_split=${forget_split} \ - retain_split=${retain_split} \ - model.model_args.pretrained_model_name_or_path=${model_path} \ - retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \ - trainer.args.per_device_train_batch_size=$per_device_train_batch_size \ - trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \ - - # Eval - CUDA_VISIBLE_DEVICES=0 python src/eval.py \ - experiment=eval/tofu/default.yaml \ - forget_split=${forget_split} \ - model=${model} \ - task_name=${task_name} \ - model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ - paths.output_dir=saves/unlearn/${task_name}/evals \ - retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json + for lr in "${lrs[@]}"; do + for beta in "${betas[@]}"; do + for alpha in "${alphas[@]}"; do + task_name=tofu_${model}_${forget_split}_${trainer}_lr${lr}_beta${beta}_alpha${alpha} + model_path=open-unlearning/tofu_${model}_full + echo ${task_name}: Unlearning ${model_path} using ${trainer} + + # Unlearn + CUDA_VISIBLE_DEVICES=0 \ + python src/train.py --config-name=unlearn.yaml \ + experiment=${experiment} \ + trainer=${trainer} \ + task_name=${task_name} \ + model=${model} \ + forget_split=${forget_split} \ + retain_split=${retain_split} \ + model.model_args.pretrained_model_name_or_path=${model_path} \ + 
retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \ + trainer.args.per_device_train_batch_size=$per_device_train_batch_size \ + trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \ + trainer.args.eval_strategy=no \ + trainer.args.eval_on_start=False \ + trainer.args.learning_rate=$lr \ + trainer.method_args.beta=$beta \ + trainer.method_args.alpha=$alpha + + # Eval + CUDA_VISIBLE_DEVICES=0 python src/eval.py \ + experiment=eval/tofu/default.yaml \ + forget_split=${forget_split} \ + model=${model} \ + task_name=${task_name} \ + model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ + paths.output_dir=saves/unlearn/${task_name}/evals \ + retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json + done + done + done done done done - -# ######################################################### -# #################### MUSE Unlearning #################### -# ######################################################### - - -model=Llama-3.2-1B-Instruct - -data_splits=( - "News" - "Books" -) - -trainers=( - "UNDIAL" -) - -for data_split in "${data_splits[@]}"; do - for trainer in "${trainers[@]}"; do - - task_name=muse_${model}_${data_split}_${trainer} - - python src/train.py --config-name=unlearn.yaml \ - experiment=unlearn/muse/default.yaml \ - model=${model} \ - data_split=${data_split} \ - trainer=${trainer} \ - task_name=${task_name} \ - retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json \ - trainer.args.per_device_train_batch_size=${per_device_train_batch_size} \ - trainer.args.gradient_accumulation_steps=${gradient_accumulation_steps} \ - - CUDA_VISIBLE_DEVICES=0 python src/eval.py \ - experiment=eval/muse/default.yaml \ - data_split=${data_split} \ - task_name=${task_name} \ - model=${model} \ - model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ - paths.output_dir=saves/unlearn/${trainer}/evals \ - retain_logs_path=saves/eval/muse_${model}_${data_split}_retrain/MUSE_EVAL.json - done -done From 0687b355048f91a9e401a6fd2a710846afc3c237 Mon Sep 17 00:00:00 2001 From: Dornavineeth Date: Thu, 22 May 2025 20:34:43 +0000 Subject: [PATCH 15/15] Update readme --- README.md | 4 ++-- docs/links.md | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fc423e8b..135af95b 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ ## πŸ“– Overview -We provide efficient and streamlined implementations of the TOFU, MUSE and WMDP unlearning benchmarks while supporting 6 unlearning methods, 5+ datasets, 10+ evaluation metrics, and 7+ LLM architectures. Each of these can be easily extended to incorporate more variants. +We provide efficient and streamlined implementations of the TOFU, MUSE and WMDP unlearning benchmarks while supporting 7 unlearning methods, 5+ datasets, 10+ evaluation metrics, and 7+ LLM architectures. Each of these can be easily extended to incorporate more variants. We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field. 
@@ -62,7 +62,7 @@ We provide several variants for each of the components in the unlearning pipelin
 | **Component**          | **Available Options** |
 |------------------------|----------------------|
 | **Benchmarks**         | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/), [WMDP](https://www.wmdp.ai/) |
-| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU |
+| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU, UNDIAL |
 | **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks, [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) |
 | **Datasets**           | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits), WMDP-Bio, WMDP-Cyber |
 | **Model Families**     | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma, Zephyr |
diff --git a/docs/links.md b/docs/links.md
index c41f325e..b84977cf 100644
--- a/docs/links.md
+++ b/docs/links.md
@@ -25,6 +25,7 @@ Links to research papers and resources corresponding to implemented features in
 | SimNPO | Paper [📄](https://arxiv.org/abs/2410.07163), Code [🐙](https://github.com/OPTML-Group/Unlearn-Simple) |
 | IdkDPO | TOFU ([📄](https://arxiv.org/abs/2401.06121)) |
 | RMU | WMDP paper ([🐙](https://github.com/centerforaisafety/wmdp/tree/main/rmu), [🌐](https://www.wmdp.ai/)), later used in G-effect ([🐙](https://github.com/tmlr-group/G-effect/blob/main/dataloader.py)) |
+| UNDIAL | Paper [📄](https://arxiv.org/pdf/2402.10052), Code [🐙](https://github.com/dong-river/LLM_unlearning/tree/main) |
 
 ---