diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml index f9cd2a44..eb30cc28 100644 --- a/.github/workflows/code_checks.yml +++ b/.github/workflows/code_checks.yml @@ -29,10 +29,10 @@ jobs: run-code-check: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6.0.2 + - uses: actions/checkout@v7.0.0 - name: Install uv - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b + uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 with: # Install a specific version of uv. version: "0.5.21" diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 013f410c..6f9fa896 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -42,10 +42,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v6.0.2 + uses: actions/checkout@v7.0.0 - name: Install uv - uses: astral-sh/setup-uv@v8.1.0 + uses: astral-sh/setup-uv@v8.2.0 with: version: "0.5.21" enable-cache: true @@ -77,7 +77,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v6.0.2 + uses: actions/checkout@v7.0.0 - name: Configure Git Credentials run: | diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index e31d69ed..e108d385 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -41,10 +41,10 @@ jobs: integration-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6.0.2 + - uses: actions/checkout@v7.0.0 - name: Install uv - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b + uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 with: # Install a specific version of uv. version: "0.5.21" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 038d39d6..4f0b855f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -16,10 +16,10 @@ jobs: run: | sudo apt-get update sudo apt-get install libcurl4-openssl-dev libssl-dev - - uses: actions/checkout@v6.0.2 + - uses: actions/checkout@v7.0.0 - name: Install uv - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b + uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 with: # Install a specific version of uv. version: "0.5.21" diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index fb67d417..37c65ed4 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -41,10 +41,10 @@ jobs: unit-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6.0.2 + - uses: actions/checkout@v7.0.0 - name: Install uv - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b + uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 with: # Install a specific version of uv. version: "0.5.21" diff --git a/examples/gan/README.md b/examples/gan/README.md index 9764e423..8822a748 100644 --- a/examples/gan/README.md +++ b/examples/gan/README.md @@ -94,7 +94,7 @@ Kolmogorov-Smirnov (KS) test, Total Variation Distance (TVD), Correlation Matrix and Mutual Information Difference. To compute those metrics, you can run the command below. The name of the table should be -defined in the `dataset_meta.json` file, and the file for synthetic data should be under +defined in the `dataset_meta.json` file, and the data files should be under `/data/{table_name}.csv` for the real data and `/results/{table_name}_synthetic.csv` for the synthetic data. diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md index 8b3295c2..0a3ad963 100644 --- a/examples/gan/ensemble_attack/README.md +++ b/examples/gan/ensemble_attack/README.md @@ -60,10 +60,6 @@ python -m examples.gan.ensemble_attack.make_challenge_dataset ## 4. Training the attack model -> [!NOTE] -> In the [`config.yaml`](config.yaml) file, the attribute `ensemble_attack.shadow_training.model_name` -> is what determines this attack will be run with the CTGAN model. - To train the attack models, execute the following command: ```bash @@ -81,7 +77,7 @@ To test the attack model against the target model and synthetic data produced on [step 2](#2-generating-target-synthetic-data-to-be-tested), please run: ```bash -python -m examples.gan.ensemble_attack.test_attack_model +python -m examples.gan.ensemble_attack.run_test_attack_model ``` ## 6. Compute the attack success diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml index 55b083a6..b123f629 100644 --- a/examples/gan/ensemble_attack/config.yaml +++ b/examples/gan/ensemble_attack/config.yaml @@ -36,7 +36,6 @@ ensemble_attack: run_metaclassifier_training: true shadow_training: - model_name: ctgan model_config: # Configurations specific for the CTGAN model training: epochs: 300 diff --git a/examples/gan/ensemble_attack/make_challenge_dataset.py b/examples/gan/ensemble_attack/make_challenge_dataset.py index 1ab30ea9..8eb7c09c 100644 --- a/examples/gan/ensemble_attack/make_challenge_dataset.py +++ b/examples/gan/ensemble_attack/make_challenge_dataset.py @@ -39,6 +39,8 @@ def make_challenge_dataset(config: DictConfig) -> None: log(INFO, f"Saving challenge labels to {challenge_label_path}") np.save(challenge_label_path, challenge_data_labels) + log(INFO, "Done!") + if __name__ == "__main__": make_challenge_dataset() diff --git a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/run_test_attack_model.py similarity index 100% rename from examples/gan/ensemble_attack/test_attack_model.py rename to examples/gan/ensemble_attack/run_test_attack_model.py diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py index e650db3d..83b2b6fc 100644 --- a/examples/gan/ensemble_attack/utils.py +++ b/examples/gan/ensemble_attack/utils.py @@ -40,6 +40,14 @@ def make_training_config(config: DictConfig) -> dict[Any, Any]: Returns: The ensemble attack training config for the CTGAN model. """ + base_data_dir = str + if "base_data_dir" in config: + base_data_dir = config.base_data_dir + elif "data_dir" in config: + base_data_dir = config.data_dir + else: + raise ValueError("Either base_data_dir or data_dir must be provided in the config.") + # Saving the model config from the config.yaml into a json file # because that's what the ensemble attack code will be looking for training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path) @@ -48,10 +56,10 @@ def make_training_config(config: DictConfig) -> dict[Any, Any]: training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config, resolve=True) assert isinstance(training_config, dict), "Training config must be a dictionary." training_config["general"] = { - "test_data_dir": config.base_data_dir, + "test_data_dir": base_data_dir, "sample_prefix": "ctgan", - "data_dir": config.base_data_dir, - "workspace_dir": str(Path(config.base_data_dir) / "shadow_workspace"), + "data_dir": base_data_dir, + "workspace_dir": str(Path(base_data_dir) / "shadow_workspace"), "exp_name": "pre_trained_model", } json.dump(training_config, f) diff --git a/examples/tabsyn/README.md b/examples/tabsyn/README.md new file mode 100644 index 00000000..53c3e8cf --- /dev/null +++ b/examples/tabsyn/README.md @@ -0,0 +1,92 @@ +# TabSyn Single Table Example + +This example will go over training a single-table [TabSyn](https://arxiv.org/abs/2310.09656) +model and synthesizing data afterwards. + + +## Downloading data + +First, we need the data. Download it from this +[Google Drive link](https://drive.google.com/file/d/1HTgfgeL5GXc8uAGfeQirJrUynK7vFeyb/view?usp=drive_link), +extract the files and place them in a `/data` folder in within this folder +(`examples/tabsyn`). + +> [!NOTE] +> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +Here is a description of the files that have been extracted: +- `trans.csv`: The training data. It consists of information about bank transactions and it +contains 20,000 data points. +- `trans_info.json`: Metadata about the `trans.csv` data, with information such as which columns are +numerical and which are categorical, what is the task type, etc. + + +## Kicking off training + +To kick off training, simply run the command below from the project's root folder: + +```bash +python -m examples.tabsyn.train +``` + + +## Training results + +The result files will be saved inside a `/results` folder within this folder +(`examples/tabsyn`). + +> [!NOTE] +> If you wish to change the save folder, you can do so by editing the `results_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +In the `/results/trans` folder, there will be a file called `model.pt`, +which is a pytorch saved model. + + +## Synthesizing data + +To synthesize some data with the trained model, run: + +```bash +python -m examples.tabsyn.synthesize +``` + +If there is already a trained model in the `/results` folder, it will use that model. +Otherwise it will train one from scratch. At the end of the script, it will save the +synthesized data to `/results/trans/synthetic_data/trans_synthetic.csv`. + + +## Evaluating the quality of the synthetic data + +### Alpha Precision + +To run a round of evaluation with [Alpha Precision](https://arxiv.org/abs/2301.07573) +metrics on a set of synthetic data, run the `evaluate.py` script: + +```bash +python -m midst_toolkit.evaluation.quality.scripts.midst_alpha_precision_eval \ + --synthetic_data_path examples/tabsyn/results/trans/synthetic_data/trans_synthetic.csv \ + --real_data examples/tabsyn/data/trans_sampled.csv \ + --meta_info_path examples/gan/data/meta_info.json \ + --save_directory examples/gan/results/ +``` + +It will save the evaluation results under the `/results/model.txt` file. + +### Additional Metrics + +The calculation of additional metrics are set up in the `evaluate.py` file. They are the +Kolmogorov-Smirnov (KS) test, Total Variation Distance (TVD), Correlation Matrix Difference +and Mutual Information Difference. + +To compute those metrics, you can run the command below. The data files should +be under `/data/{table_name}.csv` for the real data, `/data/{table_name}_samples.csv` +for the sampleed data used for training, and `/results/{table_name}_synthetic.csv` +for the synthetic data. + +```bash +python -m examples.tabsyn.evaluate +``` + +The results will be saved in the `/results/evaluation.json` file. diff --git a/examples/tabsyn/config.yaml b/examples/tabsyn/config.yaml new file mode 100644 index 00000000..d15bd606 --- /dev/null +++ b/examples/tabsyn/config.yaml @@ -0,0 +1,85 @@ +data_dir: examples/tabsyn/data +results_dir: examples/tabsyn/results +tabsyn_config: examples/tabsyn/tabsyn_config.toml +table_name: trans + +training: + sample_size: 20000 + +evaluation: + # Reporting Configuration + write_report: True + metric_report_path: ${results_dir}/evaluation_results.txt + + # Privacy Evaluation Configurations + dcr: + run: True + norm: "l2" + batch_size: 1000 + + median_dcr: + run: True + norm: "l2" + batch_size: 1000 + + hitting_rate: + run: True + hitting_threshold: 0.03 + + eir: + run: True + norm: "gower" + + nndr: + run: True + norm: "l2" + batch_size: 1000 + + # Quality Evaluation Configurations + ks_tv: + run: True + significance_level: 0.05 + permutations: 1000 + + alpha_precision: + run: True + naive_only: False + + ci_overlap: + run: True + confidence_level: 95 + + correlation_diff: + run: True + compute_mixed_correlations: True + + mean_diff: + run: True + + f1_score_diff: + run: True + label_column: "trans_type" + folds: 5 + f1_type: "macro" + + regression_score_diff: + run: True + label_column: "trans_type" + preprocess_labels: True + measure_metrics_in_original_label_space: False + verbose: True + convert_label_to_float: True + + hellinger: + run: True + include_numerical_columns: True + + propensity_mse: + run: True + folds: 5 + max_iterations: 50 + solver: "liblinear" + + mutual_information: + run: True + include_numerical_columns: False diff --git a/examples/tabsyn/ensemble_attack/README.md b/examples/tabsyn/ensemble_attack/README.md new file mode 100644 index 00000000..00054d2f --- /dev/null +++ b/examples/tabsyn/ensemble_attack/README.md @@ -0,0 +1,95 @@ +# TabSyn Ensemble Attack Example + +On this example, we demonstrate how to run the [Ensemble Attack](../../ensemble_attack/README.md) +using the [TabSyn](https://arxiv.org/abs/2310.09656) model. + +## 1. Downloading data + +First, we need the data. Download it from this +[Google Drive link](https://drive.google.com/file/d/16XCa63eD2dZ1bddhgRbGFuzAuuMlto9P/view?usp=sharing), +extract the files and place them in a `/data/ensemble_attack` folder within this folder +(`examples/tabsyn`). + +> [!NOTE] +> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute +> of the [`config.yaml`](config.yaml) file. + +Here is a description of the files that have been extracted: +- `trans.csv`: The full set of training data. +- `dataset_meta.json`: Metadata about the relationship between the tables in the dataset. Since this is a +single table dataset, it will only contain information about the transaction (`trans`) table. +- `trans_domain.json`: Metadata about the columns of the transaction table, such as their size +and type (`continuous` or `discrete`). +- `trans_info.json` and `meta_info.json`: Metadata about the `trans.csv` data, with information +such as which columns are numerical and which are categorical, what is the task type, etc. +- `data_types.json`: Additional metadata about the columns, splitting them into 4 types: + - `numerical`: a list of the columns that contain numerical information + - `categorical`: a list of the columns that contain categorical information + - `variable_to_predict`: the name of the target column that will be predicted + - `id_column_name`: the name of the column in the table that represents the rows' id. + +With the data present in the correct folder, we can proceed with running the attack. + +## 2. Generating target synthetic data to be tested + +The **target model** is the model being attacked, and the **target synthetic data** +is the synthetic data generated by the target model that will be evaluated against +the attack. + +If you already have a set of synthetic data produced by a target model, +you can add its path to the `ensemble_attack.target_model.target_synthetic_data_path` +property in the [`config.yaml`](config.yaml) file and skip this step. + +If you wish to train a new target model and produce the synthetic data that will be the +target of the attack, you can run: + +```bash +python -m examples.tabsyn.synthesize --config-path=./ensemble_attack +``` + +## 3. Producing the challenge points dataset + +The challenge points dataset is composed of real data points where half of them +were used in training the target model and half weren't. It is the dataset we are going +to use to evaluate how good the attack model is in differentiating between +the points used in training and the ones not used in training. + +To produce such dataset, run the following script: + +```bash +python -m examples.tabsyn.ensemble_attack.make_challenge_dataset +``` + +## 4. Training the attack model + +To train the attack models, execute the following command: + +```bash +python -m examples.tabsyn.ensemble_attack.train_attack_model +``` + +This will take a long time to run, so it might be a good idea to execute it as a +background process. If you want to have a quick test run before kicking off the +full process, you can change the number of iterations, epochs, population and +sample sizes to smaller numbers. + +## 5. Testing the attack model + +To test the attack model against the target model and synthetic data produced on +[step 2](#2-generating-target-synthetic-data-to-be-tested), please run: + +```bash +python -m examples.tabsyn.ensemble_attack.run_test_attack_model +``` + +## 6. Compute the attack success + +To compute the metrics about the success of the attack against the target +synthetic data, you can run the following command: + +```bash +python -m examples.tabsyn.ensemble_attack.compute_attack_success +``` + +The results will both printed on the console and saved in the file +`examples/tabsyn/results/attack_success_for_xgb_metaclassifier_model.txt` diff --git a/examples/tabsyn/ensemble_attack/compute_attack_success.py b/examples/tabsyn/ensemble_attack/compute_attack_success.py new file mode 100644 index 00000000..c8c09f99 --- /dev/null +++ b/examples/tabsyn/ensemble_attack/compute_attack_success.py @@ -0,0 +1,30 @@ +from logging import INFO +from pathlib import Path + +import hydra +from omegaconf import DictConfig + +from examples.ensemble_attack.compute_attack_success import compute_attack_success_for_given_targets +from midst_toolkit.common.logger import log + + +@hydra.main(config_path="./", config_name="config", version_base=None) +def compute_attack_success(config: DictConfig) -> None: + """Main function to compute the attack success.""" + log( + INFO, + f"Computing attack success for target synthetic data at {config.ensemble_attack.target_model.target_synthetic_data_path}...", + ) + + compute_attack_success_for_given_targets( + target_model_config=config.ensemble_attack.target_model, + # TODO: refactor this to work better outside of the challenge context (i.e. no target ID) + # No target ID needed for CTGAN, but it needs at least one element in this array. The value does not matter. + target_ids=[0], + experiment_directory=Path(config.results_dir), + metaclassifier_model_name=config.ensemble_attack.metaclassifier.meta_classifier_model_name, + ) + + +if __name__ == "__main__": + compute_attack_success() diff --git a/examples/tabsyn/ensemble_attack/config.yaml b/examples/tabsyn/ensemble_attack/config.yaml new file mode 100644 index 00000000..93613222 --- /dev/null +++ b/examples/tabsyn/ensemble_attack/config.yaml @@ -0,0 +1,159 @@ +# Training example configuration +# Base data directory (can be overridden from command line) +data_dir: examples/tabsyn/ensemble_attack/data +results_dir: examples/tabsyn/ensemble_attack/results +tabsyn_config: examples/tabsyn/tabsyn_config.toml +table_name: trans +data_file_name: ${table_name}.csv + +training: + sample_size: 20000 # number of rows to sample from the real data for training + +ensemble_attack: + random_seed: null # Set this to a value if you want to set a random seed for reproducibility + table_id_column_name: "trans_id" + data_file_name: ${data_file_name} + + data_paths: + processed_attack_data_path: ${data_dir} + population_path: ${data_dir} # This is the population data that the attacker has collected or has access to. + attack_evaluation_result_path: ${results_dir}/evaluation_results # Path where the attack evaluation results will be stored + + data_processing_config: + column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase. + population_sample_size: 40000 # Population size is the total data that your attack has access to. + + pipeline: + # TODO: properly test these + run_data_processing: true # Set this to false if you have already saved the processed data + run_shadow_model_training: true # Set this to false if shadow models are already trained and saved + run_metaclassifier_training: true + + shadow_training: + model_name: ctgan + model_config: # Configurations specific for the CTGAN model + training: + epochs: 300 + verbose: True + synthesizing: + sample_size: 20000 + shadow_models_output_path: ${results_dir}/ensemble_attack/shadow_models + target_model_output_path: ${results_dir}/shadow_target_model_and_data + training_json_config_paths: # Config json files used for tabddpm training on the trans table + table_domain_file_path: ${data_dir}/trans_domain.json + dataset_meta_file_path: ${data_dir}/dataset_meta.json + training_config_path: ${data_dir}/trans.json # if this is not present, it will be created by copying the example config + fine_tuning_config: + fine_tune_diffusion_iterations: 200000 + fine_tune_classifier_iterations: 20000 + pre_train_data_size: 60000 + number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. + + final_shadow_models_path: [ + "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${ensemble_attack.shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", + ] + target_synthetic_data_path: ${ensemble_attack.shadow_training.target_model_output_path}/target_synthetic_data.csv + + # Metaclassifier settings + metaclassifier: + # Data types json file is used for xgboost model training. + data_types_file_path: ${data_dir}/data_types.json + model_type: "xgb" + # Model training parameters + num_optuna_trials: 100 # Original code: 100 + num_kfolds: 5 + use_gpu: false + # Temporary. Might remove having an epoch parameter. + epochs: 1 + meta_classifier_model_name: ${ensemble_attack.metaclassifier.model_type}_metaclassifier_model + metaclassifier_model_path: ${results_dir}/trained_models # Path where the trained metaclassifier model will be saved + + target_model: # This is only used for testing the attack on a real target model. + target_synthetic_data_path: ${results_dir}/${table_name}/synthetic_data/${table_name}_synthetic.csv + challenge_data_path: ${ensemble_attack.data_paths.processed_attack_data_path}/${table_name}_challenge_data.csv + challenge_label_path: ${ensemble_attack.data_paths.processed_attack_data_path}/${table_name}_challenge_labels.npy + + target_shadow_models_output_path: ${results_dir}/test_all_targets # Sub-directory to store test shadows and results + attack_probabilities_result_path: ${results_dir}/test_probabilities + attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training RMIA attack model in testing phase. + # See select_challenge_data_for_training()'s docstring for more details. + +evaluation: + # Reporting Configuration + write_report: True + metric_report_path: ${results_dir}/evaluation_results.txt + + # Privacy Evaluation Configurations + dcr: + run: True + norm: "l2" + batch_size: 1000 + + median_dcr: + run: True + norm: "l2" + batch_size: 1000 + + hitting_rate: + run: True + hitting_threshold: 0.03 + + eir: + run: True + norm: "gower" + + nndr: + run: True + norm: "l2" + batch_size: 1000 + + # Quality Evaluation Configurations + ks_tv: + run: True + significance_level: 0.05 + permutations: 1000 + + alpha_precision: + run: True + naive_only: False + + ci_overlap: + run: True + confidence_level: 95 + + correlation_diff: + run: True + compute_mixed_correlations: True + + mean_diff: + run: True + + f1_score_diff: + run: True + label_column: "trans_type" + folds: 5 + f1_type: "macro" + + regression_score_diff: + run: True + label_column: "trans_type" + preprocess_labels: True + measure_metrics_in_original_label_space: False + verbose: True + convert_label_to_float: True + + hellinger: + run: True + include_numerical_columns: True + + propensity_mse: + run: True + folds: 5 + max_iterations: 50 + solver: "liblinear" + + mutual_information: + run: True + include_numerical_columns: False diff --git a/examples/tabsyn/ensemble_attack/make_challenge_dataset.py b/examples/tabsyn/ensemble_attack/make_challenge_dataset.py new file mode 100644 index 00000000..987ba704 --- /dev/null +++ b/examples/tabsyn/ensemble_attack/make_challenge_dataset.py @@ -0,0 +1,40 @@ +from logging import INFO +from pathlib import Path + +import hydra +import numpy as np +import pandas as pd +from omegaconf import DictConfig + +from midst_toolkit.common.logger import log + + +@hydra.main(config_path="./", config_name="config", version_base=None) +def make_challenge_dataset(config: DictConfig) -> None: + """Main function to make the challenge dataset.""" + log(INFO, "Making challenge dataset...") + + real_data = pd.read_csv(Path(config.data_dir) / f"{config.table_name}.csv") + + training_data = pd.read_csv(Path(config.results_dir) / f"{config.table_name}_sampled.csv") + id_column = f"{config.table_name}_id" + untrained_data = real_data[~real_data[id_column].isin(training_data[id_column])].sample(len(training_data)) + + challenge_data = pd.concat([training_data, untrained_data]) + challenge_data_labels = np.concatenate([np.ones(len(training_data)), np.zeros(len(untrained_data))]) + + processed_attack_data_path = Path(config.ensemble_attack.data_paths.processed_attack_data_path) + processed_attack_data_path.mkdir(parents=True, exist_ok=True) + + challenge_data_path = processed_attack_data_path / f"{config.table_name}_challenge_data.csv" + challenge_label_path = processed_attack_data_path / f"{config.table_name}_challenge_labels.npy" + log(INFO, f"Saving challenge data to {challenge_data_path}") + challenge_data.to_csv(challenge_data_path, index=False) + log(INFO, f"Saving challenge labels to {challenge_label_path}") + np.save(challenge_label_path, challenge_data_labels) + + log(INFO, "Done!") + + +if __name__ == "__main__": + make_challenge_dataset() diff --git a/examples/tabsyn/ensemble_attack/run_test_attack_model.py b/examples/tabsyn/ensemble_attack/run_test_attack_model.py new file mode 100644 index 00000000..50b52fb1 --- /dev/null +++ b/examples/tabsyn/ensemble_attack/run_test_attack_model.py @@ -0,0 +1,32 @@ +from logging import INFO + +import hydra +from omegaconf import DictConfig + +from examples.ensemble_attack.test_attack_model import run_metaclassifier_testing +from examples.gan.ensemble_attack.utils import make_training_config +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabSynModelRunner +from midst_toolkit.common.logger import log + + +@hydra.main(config_path="./", config_name="config", version_base=None) +def attack_model_test(config: DictConfig) -> None: + """ + Main function to test the attack model. + + Args: + config: The configuration object from the config.yaml file. + """ + log( + INFO, + f"Testing attack model against synthetic data at {config.ensemble_attack.target_model.target_synthetic_data_path}...", + ) + + make_training_config(config) + model_runner = EnsembleAttackTabSynModelRunner(config) + + run_metaclassifier_testing(model_runner, config.ensemble_attack) + + +if __name__ == "__main__": + attack_model_test() diff --git a/examples/tabsyn/ensemble_attack/train_attack_model.py b/examples/tabsyn/ensemble_attack/train_attack_model.py new file mode 100644 index 00000000..8914858d --- /dev/null +++ b/examples/tabsyn/ensemble_attack/train_attack_model.py @@ -0,0 +1,88 @@ +from logging import INFO +from pathlib import Path + +import hydra +from omegaconf import DictConfig + +from examples.ensemble_attack.run_metaclassifier_training import run_metaclassifier_training +from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training +from examples.gan.ensemble_attack.utils import get_master_challenge_train_data +from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe +from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabSynModelRunner +from midst_toolkit.attacks.ensemble.process_split_data import process_split_data +from midst_toolkit.common.logger import log +from midst_toolkit.common.random import set_all_random_seeds + + +@hydra.main(config_path="./", config_name="config", version_base=None) +def train_attack_model(config: DictConfig) -> None: + """ + Train the Ensemble Attack pipeline with CTGAN model. + + As the first step, data processing is done. + Second step is shadow model training used for RMIA attack. + Third step is metaclassifier training and evaluation. + + Args: + config: Attack configuration as an OmegaConf DictConfig object. + """ + if config.ensemble_attack.random_seed is not None: + set_all_random_seeds(seed=config.ensemble_attack.random_seed) + log(INFO, f"Training phase random seed set to {config.ensemble_attack.random_seed}.") + + if config.ensemble_attack.pipeline.run_data_processing: + log(INFO, "Running data processing pipeline...") + # The following function saves the required dataframe splits in the specified processed_attack_data_path path. + population_data = load_dataframe( + Path(config.ensemble_attack.data_paths.population_path), + config.data_file_name, + ) + + # Removing id columns and saving the dataset + id_columns = [c for c in population_data.columns if c.endswith("_id")] + population_data_no_id = population_data.drop(columns=id_columns) + save_dataframe( + population_data_no_id, + Path(config.ensemble_attack.data_paths.population_path), + f"{Path(config.data_file_name).stem}_no_id.csv", + ) + + process_split_data( + all_population_data=population_data, + processed_attack_data_path=Path(config.ensemble_attack.data_paths.processed_attack_data_path), + # TODO: column_to_stratify value is not documented in the original codebase. + column_to_stratify=config.ensemble_attack.data_processing_config.column_to_stratify, + num_total_samples=config.ensemble_attack.data_processing_config.population_sample_size, + random_seed=config.ensemble_attack.random_seed, + ) + + if config.ensemble_attack.pipeline.run_shadow_model_training: + log(INFO, "Training the shadow models...") + + model_runner = EnsembleAttackTabSynModelRunner(config) + + master_challenge_train = get_master_challenge_train_data(config) + shadow_data_paths = run_shadow_model_training(model_runner, config.ensemble_attack, master_challenge_train) + + log(INFO, "Training the target model...") + target_model_synthetic_path = run_target_model_training(model_runner, config.ensemble_attack) + + if config.ensemble_attack.pipeline.run_metaclassifier_training: + log(INFO, "Training the metaclassifier...") + if not config.ensemble_attack.pipeline.run_shadow_model_training: + # If shadow model training is skipped, we need to provide the previous shadow model and target model paths. + shadow_data_paths = [ + Path(path) for path in config.ensemble_attack.shadow_training.final_shadow_models_path + ] + target_model_synthetic_path = Path(config.ensemble_attack.shadow_training.target_synthetic_data_path) + + assert len(shadow_data_paths) == 3, "The attack_data_paths list must contain exactly three elements." + assert target_model_synthetic_path is not None, ( + "The target_data_path must be provided for metaclassifier training." + ) + + run_metaclassifier_training(config.ensemble_attack, shadow_data_paths, target_model_synthetic_path) + + +if __name__ == "__main__": + train_attack_model() diff --git a/examples/tabsyn/evaluate.py b/examples/tabsyn/evaluate.py new file mode 100644 index 00000000..5853bf74 --- /dev/null +++ b/examples/tabsyn/evaluate.py @@ -0,0 +1,449 @@ +from logging import INFO +from pathlib import Path +from typing import Any + +import hydra +import pandas as pd +from omegaconf import DictConfig + +from examples.midst_evaluation.preprocessing import ( + get_numerical_and_categorical_column_names, + preprocess_data_for_alpha_precision_eval, + syntheval_preprocess, +) +from midst_toolkit.common.logger import log +from midst_toolkit.data_processing.midst_data_processing import load_midst_data_with_test +from midst_toolkit.evaluation.metrics_base import MetricBase +from midst_toolkit.evaluation.privacy import ( + DistanceToClosestRecordScore, + EpsilonIdentifiabilityRisk, + HittingRate, + MedianDistanceToClosestRecordScore, + NearestNeighborDistanceRatio, +) +from midst_toolkit.evaluation.privacy.distance_preprocess import preprocess_for_distance_computation +from midst_toolkit.evaluation.privacy.distance_utils import NormType +from midst_toolkit.evaluation.privacy.epsilon_identifiability_risk import EpsilonIdentifiabilityNorm +from midst_toolkit.evaluation.quality import ( + AlphaPrecision, + CorrelationMatrixDifference, + DimensionwiseMeanDifference, + KolmogorovSmirnovAndTotalVariation, + MeanConfidenceIntervalOverlap, + MeanF1ScoreDifference, + MeanHellingerDistance, + MeanPropensityMeanSquaredError, + MeanRegressionDifference, + MutualInformationDifference, +) +from midst_toolkit.evaluation.quality.confidence_interval_overlap import ConfidenceLevel + + +SEPARATOR = "-" * 80 + + +def log_metrics(header: str, results: dict[str, float]) -> None: + """ + Helper function to log metrics associated with the results dictionary in a structured fashion. The header + is used to separate out different families of metrics in the output. + + Args: + header: String to describe the set of metrics that will be logged. + results: Dictionary of metric names (keys) and metric values (values) to be logged. + """ + log(INFO, f"\n{header}\n{SEPARATOR}\n") + for metric_name, metric_value in results.items(): + log(INFO, rf"Metric: {metric_name}\Score: {metric_value}") + log(INFO, f"{SEPARATOR}\n") + + +def write_metrics(metric_report_path: Path, header: str, results: dict[str, float]) -> None: + """ + Helper function to write metrics associated with the results dictionary in a structured fashion to a file. The + header is used to separate out different families of metrics in the output. + + Args: + metric_report_path: Path to with the metrics will be written + header: String to describe the set of metrics that will be written. + results: Dictionary of metric names (keys) and metric values (values) to be written. + """ + with open(metric_report_path, "a") as f: + f.write(f"\n{header}\n{SEPARATOR}\n") + for metric_name, metric_value in results.items(): + f.write(f"Metric: {metric_name:40}Metric: {metric_value}\n") + f.write(f"{SEPARATOR}\n") + + +def report_metrics(cfg: DictConfig, header: str, results: dict[str, float]) -> None: + """ + A helper function facilitate both logging and, optionally depending on the configuration settings, write the + metrics results to a file. + + Args: + cfg: Configuration determining if and where to write metrics to a file. + header: String to describe the set of metrics that will be logged/written. + results: Dictionary of metric names (keys) and metric values (values) to be logged/written. + """ + log_metrics(header, results) + write_metrics(Path(cfg.metric_report_path), header, results) + + +def remove_label_column_from_other_columns( + label_column: str, numerical_columns: list[str], categorical_columns: list[str] +) -> tuple[list[str], list[str]]: + """ + Given a column name for a target label (task label), this ensures that the label is removed from either the + numerical or categorical columns list. During preprocessing, it is advantageous to also process the label + column into, for instance, and ordinal value. However, when performing F1 measurements, we no longer want it + to be part of the dataframe during training. + + Args: + label_column: Column name associated with task labels of interest. + numerical_columns: Set of column names associated with numerical values. + categorical_columns: Set of column names associated with categorical values. + + Raises: + ValueError: Will throw an error if the label column is present in both column names lists, which is bad... + + Returns: + Filtered copies of the numerical and categorical column names without the specified label column included. + """ + if label_column in numerical_columns and label_column in categorical_columns: + raise ValueError("Label column appears in both types of columns...") + return [item for item in numerical_columns if label_column != item], [ + item for item in categorical_columns if label_column != item + ] + + +def should_syntheval_preprocess(cfg: DictConfig, for_privacy: bool) -> bool: + """ + Determines whether, based on the configuration, syntheval preprocessing needs to be performed. + + Args: + cfg: Configuration with the settings for the entire evaluation pipeline. + for_privacy: Whether this is for the privacy evaluations (True) or for the quality evaluations (False). + + Returns: + True if preprocessing with the SynthEval pipeline should be performed, False otherwise + """ + if for_privacy: + return any([cfg.hitting_rate.run, cfg.eir.run]) + return any( + [ + cfg.ks_tv.run, + cfg.ci_overlap.run, + cfg.ks_tv.run, + cfg.correlation_diff.run, + cfg.mean_diff.run, + cfg.f1_score_diff.run, + cfg.regression_score_diff.run, + cfg.hellinger.run, + cfg.propensity_mse.run, + cfg.mutual_information.run, + ] + ) + + +def run_quality_evaluations( + # ruff: noqa: PLR0915 + cfg: DictConfig, + real_data_train: pd.DataFrame, + synthetic_data: pd.DataFrame, + real_data_test: pd.DataFrame, + meta_info: dict[str, Any], +) -> None: + """ + Run quality evaluation metrics. + + Args: + cfg: Configuration information for the evaluation. + real_data_train: Dataframe containing real data to which the synthetic data will be compared (typically used to + train the model that generation the synthetic data). + synthetic_data: Dataframe containing the synthetic data whose quality is to be measured. + real_data_test: Dataframe containing real data to which the synthetic data will be compared (typically + explicitly NOT USED to train the model that generation the synthetic data). + meta_info: Dictionary containing information about the dataframes, including which columns correspond to + numerical and categorical values or whether there is a column corresponding to a label. + """ + metric: MetricBase + if cfg.alpha_precision.run: + log(INFO, "Preprocessing Data for Alpha Precision Evaluation") + # Categorical values are one-hot encoded, numerical values are left alone. + alpha_precision_real_data, alpha_precision_synthetic_data = preprocess_data_for_alpha_precision_eval( + real_data=real_data_train, synthetic_data=synthetic_data, meta_info=meta_info + ) + log(INFO, "Running Alpha-Precision Evaluation") + metric = AlphaPrecision(naive_only=cfg.alpha_precision.naive_only) + results = metric.compute(alpha_precision_real_data, alpha_precision_synthetic_data) + report_metrics(cfg, "ALPHA PRECISION", results) + + # Shared preprocessing for syntheval based metrics if they are to be run + if should_syntheval_preprocess(cfg, for_privacy=False): + log(INFO, "Preprocessing Data with SynthEval pipeline") + numerical_columns, categorical_columns = get_numerical_and_categorical_column_names(real_data_train, meta_info) + + # Categorical values are ordinal encoded, numerical values are min-max scaled + syntheval_real_data_train, syntheval_synthetic_data, syntheval_real_data_test = syntheval_preprocess( + numerical_columns, categorical_columns, real_data_train, synthetic_data, real_data_test + ) + + if cfg.ks_tv.run: + log(INFO, "Running Kolmogorov-Smirnov and Total Variation Evaluation") + metric = KolmogorovSmirnovAndTotalVariation( + categorical_columns=categorical_columns, + numerical_columns=numerical_columns, + significance_level=cfg.ks_tv.significance_level, + permutations=cfg.ks_tv.permutations, + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data) + report_metrics(cfg, "KOLMOGOROV SMIRNOV AND TOTAL VARIATION", results) + + if cfg.ci_overlap.run: + log(INFO, "Running Confidence Interval Overlap Evaluation") + metric = MeanConfidenceIntervalOverlap( + categorical_columns=categorical_columns, + numerical_columns=numerical_columns, + confidence_level=ConfidenceLevel(cfg.ci_overlap.confidence_level), + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data) + report_metrics(cfg, "CONFIDENCE INTERVAL OVERLAP", results) + + if cfg.correlation_diff.run: + log(INFO, "Running Mean Correlation Matrix Difference Evaluation") + metric = CorrelationMatrixDifference( + categorical_columns=categorical_columns, + numerical_columns=numerical_columns, + compute_mixed_correlations=cfg.correlation_diff.compute_mixed_correlations, + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data) + report_metrics(cfg, "CORRELATION MATRIX DIFFERENCE", results) + + if cfg.mean_diff.run: + log(INFO, "Running Dimensionwise Mean Difference Evaluation") + metric = DimensionwiseMeanDifference( + categorical_columns=categorical_columns, + numerical_columns=numerical_columns, + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data) + report_metrics(cfg, "DIMENSIONWISE MEAN DIFFERENCE", results) + + if cfg.f1_score_diff.run: + # Explicitly removing the target/label column from other column names + label_column = cfg.f1_score_diff.label_column + filtered_numerical_columns, filtered_categorical_columns = remove_label_column_from_other_columns( + label_column, numerical_columns, categorical_columns + ) + log(INFO, "Running F1 Score Difference Evaluation") + metric = MeanF1ScoreDifference( + categorical_columns=filtered_categorical_columns, + numerical_columns=filtered_numerical_columns, + label_column=label_column, + folds=cfg.f1_score_diff.folds, + f1_type=cfg.f1_score_diff.f1_type, + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data, syntheval_real_data_test) + report_metrics(cfg, "F1 SCORE DIFFERENCE", results) + + if cfg.regression_score_diff.run: + # Explicitly removing the target/label column from other column names + label_column = cfg.regression_score_diff.label_column + filtered_numerical_columns, filtered_categorical_columns = remove_label_column_from_other_columns( + label_column, numerical_columns, categorical_columns + ) + log(INFO, "Running Regression Score Difference Evaluation") + metric = MeanRegressionDifference( + categorical_columns=filtered_categorical_columns, + numerical_columns=filtered_numerical_columns, + label_column=label_column, + preprocess_labels=cfg.regression_score_diff.preprocess_labels, + include_additional_metrics=cfg.regression_score_diff.verbose, + # Regression has it's own preprocessing pipeline + do_preprocess=True, + measure_metrics_in_original_label_space=cfg.regression_score_diff.measure_metrics_in_original_label_space, + ) + + real_data_train_copy = real_data_train.copy() + synthetic_data_copy = synthetic_data.copy() + real_data_test_copy = real_data_test.copy() + if cfg.regression_score_diff.convert_label_to_float: + real_data_train_copy[label_column] = real_data_train_copy[label_column].astype(float) + synthetic_data_copy[label_column] = synthetic_data_copy[label_column].astype(float) + real_data_test_copy[label_column] = real_data_test_copy[label_column].astype(float) + + results = metric.compute(real_data_train_copy, synthetic_data_copy, real_data_test_copy) + report_metrics(cfg, "REGRESSION SCORE DIFFERENCE", results) + + if cfg.hellinger.run: + log(INFO, "Running Hellinger Distance Difference Evaluation") + metric = MeanHellingerDistance( + categorical_columns=categorical_columns, + numerical_columns=numerical_columns, + include_numerical_columns=cfg.hellinger.include_numerical_columns, + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data) + report_metrics(cfg, "HELLINGER DISTANCE DIFFERENCE", results) + + if cfg.propensity_mse.run: + log(INFO, "Running Propensity Mean Squared Error Evaluation") + metric = MeanPropensityMeanSquaredError( + categorical_columns=categorical_columns, + numerical_columns=numerical_columns, + folds=cfg.propensity_mse.folds, + max_iterations=cfg.propensity_mse.max_iterations, + solver=cfg.propensity_mse.solver, + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data) + report_metrics(cfg, "PROPENSITY MEAN SQUARED ERROR", results) + + if cfg.mutual_information.run: + log(INFO, "Running Mutual Information Difference Evaluation") + metric = MutualInformationDifference( + categorical_columns=categorical_columns, + numerical_columns=numerical_columns, + include_numerical_columns=cfg.mutual_information.include_numerical_columns, + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data) + report_metrics(cfg, "MUTUAL INFORMATION DIFFERENCE", results) + + +def run_privacy_evaluations( + cfg: DictConfig, + real_data_train: pd.DataFrame, + synthetic_data: pd.DataFrame, + real_data_test: pd.DataFrame, + meta_info: dict[str, Any], +) -> None: + """ + Run Privacy evaluation metrics. + + Args: + cfg: Configuration information for the evaluation. + real_data_train: Dataframe containing real data to which the synthetic data will be compared (typically used to + train the model that generation the synthetic data). + synthetic_data: Dataframe containing the synthetic data whose quality is to be measured. + real_data_test: Dataframe containing real data to which the synthetic data will be compared (typically + explicitly NOT USED to train the model that generation the synthetic data). + meta_info: Dictionary containing information about the dataframes, including which columns correspond to + numerical and categorical values or whether there is a column corresponding to a label. + """ + metric: MetricBase + # Shared preprocessing for syntheval based metrics if they are to be run + if should_syntheval_preprocess(cfg, for_privacy=True): + log(INFO, "Preprocessing Data with SynthEval pipeline") + numerical_columns, categorical_columns = get_numerical_and_categorical_column_names(real_data_train, meta_info) + + # Categorical values are ordinal encoded, numerical values are min-max scaled + syntheval_real_data_train, syntheval_synthetic_data = syntheval_preprocess( + numerical_columns, categorical_columns, real_data_train, synthetic_data + ) + + if cfg.hitting_rate.run: + log(INFO, "Running Hitting Rate Evaluation") + + metric = HittingRate( + categorical_columns=categorical_columns, + numerical_columns=numerical_columns, + hitting_threshold=cfg.hitting_rate.hitting_threshold, + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data) + report_metrics(cfg, "HITTING RATE", results) + + if cfg.eir.run: + log(INFO, "Running Epsilon Identifiability Rate Evaluation") + # Categorical values are ordinal encoded, numerical values are min-max scaled + metric = EpsilonIdentifiabilityRisk( + categorical_columns=categorical_columns, + numerical_columns=numerical_columns, + norm=EpsilonIdentifiabilityNorm(cfg.eir.norm), + # Already preprocessing above + do_preprocess=False, + ) + results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data) + report_metrics(cfg, "EPSILON IDENTIFIABILITY RISK", results) + + # Shared preprocessing for distance based metrics, if they are to be run. + if any([cfg.dcr.run, cfg.median_dcr.run, cfg.nndr.run]): + log(INFO, "Preprocessing Data for Distance Evaluation") + # Categorical values are one-hot encoded, numerical values are scaled by their range, but not into [0,1] + distance_real_data, distance_synthetic_data, distance_holdout_data = preprocess_for_distance_computation( + meta_info=meta_info, + real_data_train=real_data_train, + synthetic_data=synthetic_data, + real_data_test=real_data_test, + ) + + if cfg.dcr.run: + log(INFO, "Running DCR Evaluation") + metric = DistanceToClosestRecordScore(NormType(cfg.dcr.norm), cfg.dcr.batch_size, do_preprocess=False) + results = metric.compute(distance_real_data, distance_synthetic_data, distance_holdout_data) + report_metrics(cfg, "DISTANCE TO CLOSEST RECORD", results) + + if cfg.median_dcr.run: + log(INFO, "Running Median DCR Evaluation") + metric = MedianDistanceToClosestRecordScore( + NormType(cfg.median_dcr.norm), cfg.median_dcr.batch_size, do_preprocess=False + ) + results = metric.compute(distance_real_data, distance_synthetic_data) + report_metrics(cfg, "MEDIAN DISTANCE TO CLOSEST RECORD", results) + + if cfg.nndr.run: + log(INFO, "Running NNDR Evaluation") + metric = NearestNeighborDistanceRatio(NormType(cfg.nndr.norm), cfg.nndr.batch_size, do_preprocess=False) + results = metric.compute(distance_real_data, distance_synthetic_data, distance_holdout_data) + report_metrics(cfg, "NEAREST NEIGHBOR DISTANCE RATIO", results) + + +@hydra.main(config_path=".", config_name="config", version_base=None) +def main(cfg: DictConfig) -> None: + """Entry point for the evaluation script.""" + log(INFO, "Loading Data for Evaluations") + + full_data_path = Path(cfg.data_dir) / f"{cfg.table_name}.csv" + train_data_path = Path(cfg.data_dir) / f"{cfg.table_name}_sampled.csv" + synthetic_data_path = Path(cfg.results_dir) / cfg.table_name / "synthetic_data" / f"{cfg.table_name}_synthetic.csv" + meta_data_path = Path(cfg.data_dir) / f"{cfg.table_name}_info.json" + + log(INFO, "Making the test dataset...") + + full_dataset = pd.read_csv(full_data_path) + sampled_dataset = pd.read_csv(train_data_path) + merged_dataset = full_dataset.merge(sampled_dataset, how="left", indicator=True) + complement_dataset = merged_dataset.loc[merged_dataset["_merge"] == "left_only"].drop(columns="_merge") + test_dataset = complement_dataset.sample(n=len(sampled_dataset)) + + test_data_path = Path(cfg.data_dir) / f"{cfg.table_name}_test.csv" + test_dataset.to_csv(test_data_path, index=False) + + log(INFO, "Loading the data...") + + real_data_train, synthetic_data, real_data_test, meta_info = load_midst_data_with_test( + train_data_path, + synthetic_data_path, + meta_data_path, + test_data_path, + ) + + run_quality_evaluations(cfg.evaluation, real_data_train, synthetic_data, real_data_test, meta_info) + run_privacy_evaluations(cfg.evaluation, real_data_train, synthetic_data, real_data_test, meta_info) + + +if __name__ == "__main__": + main() diff --git a/examples/tabsyn/synthesize.py b/examples/tabsyn/synthesize.py new file mode 100644 index 00000000..49a8667c --- /dev/null +++ b/examples/tabsyn/synthesize.py @@ -0,0 +1,115 @@ +import json +from logging import INFO +from pathlib import Path + +import hydra +from omegaconf import DictConfig + +from examples.tabsyn.train import train_tabsyn +from midst_toolkit.common.logger import log +from midst_toolkit.common.variables import DEVICE +from midst_toolkit.models.tabsyn.config import load_config +from midst_toolkit.models.tabsyn.dataset import preprocess +from midst_toolkit.models.tabsyn.pipeline import TabSyn +from midst_toolkit.models.tabsyn.preprocessing import get_processed_data_dir + + +@hydra.main(config_path=".", config_name="config", version_base=None) +def tabsyn_synthesize(config: DictConfig) -> None: + """ + Synthesize data using the TabSyn model. + + Args: + config: Configuration as an OmegaConf DictConfig object. + """ + log(INFO, "Synthesizing data using the TabSyn model...") + + results_dir = Path(config.results_dir) + model_save_dir = results_dir / config.table_name + vae_save_dir = model_save_dir / "vae" + + if not (model_save_dir / "model.pt").exists() or not (vae_save_dir / "model.pt").exists(): + log(INFO, "Trained model not found. Training the model...") + train_tabsyn(config) + + else: + log(INFO, "Trained model found.") + + # The preprocess function below expects 2 folders of preprocessed data: + # 1. the dataset to be processed, which can be a subset of the full dataset + # 2. the full dataset, which is used to unsure it will get all the categories for the categorical features + # Here we are making the dataset #2 (full dataset) by copying the dataset #1 + dataset_path = get_processed_data_dir(Path(config.data_dir)) / config.table_name + ref_dataset_path = Path(f"{dataset_path}_all") + + tabsyn_config = load_config(Path(config.tabsyn_config)) + + log(INFO, "Instantiating the TabSyn model...") + + _, _, categories, d_numerical = preprocess( # type: ignore[misc] + dataset_path=dataset_path, + ref_dataset_path=ref_dataset_path, + transforms=tabsyn_config["transforms"], + task_type=tabsyn_config["task_type"], + ) + + # Instantiate an empty model object so we can load the model state from the checkpoint + # TODO: Refactor this constructor to allow for this use case without having to ignore the type checker + tabsyn = TabSyn(None, None, None, d_numerical, categories, device=DEVICE) # type: ignore[arg-type] + + ###### Load the model ###### + + # instantiate VAE model + tabsyn.instantiate_vae(**tabsyn_config["model_params"], optim_params=None) + + # load latent embeddings attributes of input data + train_z_att = tabsyn.load_embeddings_attributes(vae_save_dir) + token_dim = train_z_att["token_dim"] + in_dim = train_z_att["in_dim"] + + # instantiate diffusion model + tabsyn.instantiate_diffusion(in_dim=in_dim, optim_params=None) + + # load state from checkpoint + tabsyn.load_model_state(ckpt_dir=model_save_dir, dif_ckpt_name="model.pt") + + ###### Synthesize data ###### + + # get inverse tokenizers + # TODO: refactor the return of the preprocess function so we don't need to ignore mypy here + _, _, _, _, num_inverse, cat_inverse = preprocess( # type: ignore[misc] + dataset_path=dataset_path, + ref_dataset_path=ref_dataset_path, + transforms=tabsyn_config["transforms"], + task_type=tabsyn_config["task_type"], + inverse=True, + ) + + synthetic_data_dir = results_dir / config.table_name / "synthetic_data" + synthetic_data_dir.mkdir(parents=True, exist_ok=True) + + # load data info file + with open(dataset_path / "info.json", "r") as file: + data_info = json.load(file) + + data_info["token_dim"] = token_dim + + # sample data + num_samples = train_z_att["num_samples"] + in_dim = train_z_att["in_dim"] + mean_input_emb = train_z_att["mean_input_emb"] + tabsyn.sample( + num_samples, + in_dim, + mean_input_emb, + info=data_info, + num_inverse=num_inverse, + cat_inverse=cat_inverse, + save_path=synthetic_data_dir / f"{config.table_name}_synthetic.csv", + ) + + log(INFO, "Synthesizing Done!") + + +if __name__ == "__main__": + tabsyn_synthesize() diff --git a/examples/tabsyn/tabsyn_config.toml b/examples/tabsyn/tabsyn_config.toml new file mode 100644 index 00000000..6b31d7f6 --- /dev/null +++ b/examples/tabsyn/tabsyn_config.toml @@ -0,0 +1,42 @@ +task_type = "regression" + +[model_params] +n_head = 1 +factor = 32 +num_layers = 2 +d_token = 4 + +[transforms] +normalization = "quantile" +numerical_nan_policy = "mean" +categorical_nan_policy = "__none__" +category_minimum_frequency = "__none__" +categorical_encoding = "__none__" +target_policy = "default" + +[train.vae] +num_epochs = 4000 +batch_size = 4096 +num_dataset_workers = 4 + +[train.diffusion] +num_epochs = 10001 +batch_size = 4096 +num_dataset_workers = 4 + +[train.optim.vae] +lr = 1e-3 +weight_decay = 0 +factor = 0.95 +patience = 10 + +[train.optim.diffusion] +lr = 1e-3 +weight_decay = 0 +factor = 0.9 +patience = 20 + +[loss_params] +max_beta = 1e-2 +min_beta = 1e-5 +lambd = 0.7 diff --git a/examples/tabsyn/train.py b/examples/tabsyn/train.py new file mode 100644 index 00000000..66ec8940 --- /dev/null +++ b/examples/tabsyn/train.py @@ -0,0 +1,185 @@ +import shutil +from logging import INFO +from pathlib import Path + +import hydra +import pandas as pd +import torch +from omegaconf import DictConfig +from torch import Tensor +from torch.utils.data import DataLoader + +from midst_toolkit.common.enumerations import DataSplit +from midst_toolkit.common.logger import log +from midst_toolkit.common.variables import DEVICE +from midst_toolkit.models.tabsyn.config import load_config +from midst_toolkit.models.tabsyn.dataset import TabularDataset, preprocess +from midst_toolkit.models.tabsyn.pipeline import TabSyn +from midst_toolkit.models.tabsyn.preprocessing import get_processed_data_dir, process_data + + +@hydra.main(config_path=".", config_name="config", version_base=None) +def train_tabsyn(config: DictConfig) -> None: + """ + Train a TabSyn model. + + Args: + config: Configuration as an OmegaConf DictConfig object. + """ + log(INFO, "Training TabSyn model...") + results_dir = Path(config.results_dir) + results_dir.mkdir(parents=True, exist_ok=True) + + data_name = _sample_data_if_needed( + config.table_name, + Path(config.data_dir), + Path(config.results_dir), + config.get("training", {}).get("sample_size", None), + ) + process_data(config.table_name, Path(config.data_dir), Path(config.data_dir), data_name=data_name) + + tabsyn_config = load_config(Path(config.tabsyn_config)) + + # The preprocess function below expects 2 folders of preprocessed data: + # 1. the dataset to be processed, which can be a subset of the full dataset + # 2. the full dataset, which is used to unsure it will get all the categories for the categorical features + # Here we are making the dataset #2 (full dataset) by copying the dataset #1 + dataset_path = get_processed_data_dir(Path(config.data_dir)) / config.table_name + ref_dataset_path = Path(f"{dataset_path}_all") + shutil.rmtree(ref_dataset_path, ignore_errors=True) + shutil.copytree(dataset_path, ref_dataset_path) + + log(INFO, "Preprocessing data...") + + # preprocess the data + # TODO: refactor the return of the preprocess function so we don't need to ignore mypy here + numerical_features, categorical_features, categories, d_numerical = preprocess( # type: ignore[misc] + dataset_path=dataset_path, + ref_dataset_path=ref_dataset_path, + transforms=tabsyn_config["transforms"], + task_type=tabsyn_config["task_type"], + ) + + # separate train and test data + numerical_features_train = numerical_features[DataSplit.TRAIN.value] + numerical_features_test = numerical_features[DataSplit.TEST.value] + categorical_features_train = categorical_features[DataSplit.TRAIN.value] + categorical_features_test = categorical_features[DataSplit.TEST.value] + + # convert to float tensor + numerical_features_train = torch.tensor(numerical_features_train).float() + numerical_features_test = torch.tensor(numerical_features_test).float() + categorical_features_train = torch.tensor(categorical_features_train) + categorical_features_test = torch.tensor(categorical_features_test) + + log(INFO, "Loading the dataset...") + + # create dataset module + train_data = TabularDataset(numerical_features_train.float(), categorical_features_train) + + # move test data to gpu if available + numerical_features_test = numerical_features_test.float().to(DEVICE) + categorical_features_test = categorical_features_test.to(DEVICE) + + # create train dataloader + train_loader: DataLoader[TabularDataset] = DataLoader[TabularDataset]( + # Ignoring here because this is expecting the dataset to be subclass of torch's Dataset but it isn't + train_data, # type: ignore[arg-type] + batch_size=tabsyn_config["train"]["vae"]["batch_size"], + shuffle=True, + num_workers=tabsyn_config["train"]["vae"]["num_dataset_workers"], + ) + + log(INFO, "Instantiating the TabSyn model...") + + # Instantiate the model + tabsyn = TabSyn( + train_loader, + numerical_features_test, + categorical_features_test, + num_numerical_features=d_numerical, + num_classes=categories, + device=DEVICE, + ) + + model_save_dir = results_dir / config.table_name + vae_save_dir = model_save_dir / "vae" + vae_save_dir.mkdir(parents=True, exist_ok=True) + + ###### A. Train the VAE model ###### + + log(INFO, "Training the TabSyn VAE model...") + + # instantiate VAE model for training + tabsyn.instantiate_vae( + **tabsyn_config["model_params"], + optim_params=tabsyn_config["train"]["optim"]["vae"], + ) + + tabsyn.train_vae( + **tabsyn_config["loss_params"], + num_epochs=tabsyn_config["train"]["vae"]["num_epochs"], + save_path=vae_save_dir, + ) + + # embed all inputs in the latent space + tabsyn.save_vae_embeddings( + numerical_features_train, + categorical_features_train, + vae_ckpt_dir=vae_save_dir, + ) + tabsyn.save_embeddings_attributes(vae_ckpt_dir=vae_save_dir) + + ###### B. Train the Diffusion model ###### + + log(INFO, "Training the TabSyn Diffusion model...") + + # load latent space embeddings + train_z, _ = tabsyn.load_latent_embeddings(vae_save_dir) # train_z dim: B x in_dim + + # normalize embeddings + latent_train_data = (train_z - train_z.mean(0)) / 2 + + # create data loader + latent_train_loader: DataLoader[Tensor] = DataLoader[Tensor]( + # Ignoring the type checker here because our code in tabsyn.train_diffusion + # works with plain Tensor and not with TensorDataset + latent_train_data, # type: ignore[arg-type] + batch_size=tabsyn_config["train"]["diffusion"]["batch_size"], + shuffle=True, + num_workers=tabsyn_config["train"]["diffusion"]["num_dataset_workers"], + ) + + # instantiate diffusion model for training + tabsyn.instantiate_diffusion( + in_dim=train_z.shape[1], + optim_params=tabsyn_config["train"]["optim"]["diffusion"], + ) + + # train diffusion model + tabsyn.train_diffusion( + latent_train_loader, + num_epochs=tabsyn_config["train"]["diffusion"]["num_epochs"], + ckpt_path=model_save_dir, + ) + + log(INFO, "Training Done!") + + +def _sample_data_if_needed(table_name: str, data_dir: Path, results_dir: Path, sample_size: int | None) -> str: + data_name = table_name + if sample_size: + log(INFO, f"Sampling {sample_size} rows from data...") + all_data = pd.read_csv(data_dir / f"{table_name}.csv") + sampled_data = all_data.sample(n=sample_size) + results_dir.mkdir(parents=True, exist_ok=True) + + data_name = f"{table_name}_sampled" + sampled_data.to_csv(results_dir / f"{data_name}.csv", index=False) + sampled_data.to_csv(data_dir / f"{data_name}.csv", index=False) + + return data_name + + +if __name__ == "__main__": + train_tabsyn() diff --git a/src/midst_toolkit/attacks/ensemble/models.py b/src/midst_toolkit/attacks/ensemble/models.py index 5326faa6..261d2211 100644 --- a/src/midst_toolkit/attacks/ensemble/models.py +++ b/src/midst_toolkit/attacks/ensemble/models.py @@ -2,19 +2,24 @@ import copy import json +import shutil +import tempfile from abc import ABC, abstractmethod from logging import INFO from pathlib import Path from typing import Any import pandas as pd +import torch from omegaconf import DictConfig from pydantic import BaseModel, ConfigDict from sdv.metadata import SingleTableMetadata from sdv.single_table import CTGANSynthesizer +from torch.utils.data import DataLoader from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning -from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, TrainingConfig +from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, GeneralConfig, TrainingConfig +from midst_toolkit.common.enumerations import DataSplit from midst_toolkit.common.logger import log from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.clavaddpm.clustering import clava_clustering @@ -22,6 +27,10 @@ from midst_toolkit.models.clavaddpm.enumerations import GroupLengthsProbDicts, Relation, RelationOrder from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing from midst_toolkit.models.clavaddpm.train import ClavaDDPMModelArtifacts, CTGANModelArtifacts, clava_training +from midst_toolkit.models.tabsyn.config import load_config +from midst_toolkit.models.tabsyn.dataset import TabularDataset, preprocess +from midst_toolkit.models.tabsyn.pipeline import TabSyn +from midst_toolkit.models.tabsyn.preprocessing import get_processed_data_dir, process_data class EnsembleAttackTrainingResult(BaseModel): @@ -343,3 +352,298 @@ def get_single_table_svd_metadata( metadata.remove_primary_key() return metadata, data_without_ids + + +class EnsembleAttackTabSynModelRunner(EnsembleAttackModelRunner): + def __init__(self, config: DictConfig): + """ + Initialize the ensemble attack model runner for the TabSyn model with a config dictionary. + + Args: + config: The config from the config.yaml file for the ensemble attack model. + Must contain the following keys: + - tabsyn_config: The tabsyn config path for the TabSyn model. + - data_dir: The data directory for the TabSyn model. + - results_dir: The results directory for the TabSyn model. + - table_name: The name of the table the TabSyn model is being trained on. + """ + self.tabsyn_config = load_config(config.tabsyn_config) + self.table_name = config.table_name + self.full_data_dir = Path(config.data_dir) + self.training_config = TrainingConfig( + general=GeneralConfig( + data_dir=self.full_data_dir, + test_data_dir=Path(f"{self.full_data_dir}-test"), + exp_name=f"{config.table_name}-ensemble-attack", + workspace_dir=Path(config.results_dir), + sample_prefix=f"{config.table_name}-ea", + ), + save_dir=Path(config.results_dir) / self.table_name, + ) + + log(INFO, "Processing TabSyn full dataset...") + process_data(self.table_name, self.full_data_dir, self.full_data_dir) + + @property + def vae_save_dir(self) -> Path: + """ + Get the directory to save the VAE model. + + Will return the save directory relative to the current self.training_config.save_dir. + """ + assert self.training_config.save_dir is not None, "Save dir is not set" + return self.training_config.save_dir / "vae" + + def train_or_fine_tune_and_synthesize( + self, + dataset: pd.DataFrame, + synthesize: bool = True, + trained_model: EnsembleAttackTrainingResult | None = None, + ) -> EnsembleAttackTrainingResult: + """ + Train or fine tune a TabSyn model on the provided dataset and optionally synthesize data. + + Args: + dataset: The dataset as a pandas DataFrame. + synthesize: Flag indicating whether to generate synthetic data after training. + The number of points to synthesize and the save directory is controlled by + the `number_of_points_to_synthesize` and `save_dir` attributes of the training + config. Optional, default is True. + trained_model: The trained model to fine tune. If None, a new model will be trained. + """ + log(INFO, "Training or Fine Tuning TabSyn model...") + self.training_config.general.workspace_dir.mkdir(parents=True, exist_ok=True) + + with tempfile.TemporaryDirectory() as temp_dir: + tabsyn = copy.deepcopy(trained_model.models) if trained_model is not None else None + + temp_path = self._populate_temp_dataset_dir(temp_dir, dataset) + tabsyn = self._train_or_fine_tune(temp_path, tabsyn) + + synthetic_data: pd.DataFrame | None = None + if synthesize: + synthetic_data = self._synthesize(tabsyn) + + return EnsembleAttackTrainingResult( + save_dir=self.training_config.general.workspace_dir, + configs=self.training_config, + models=tabsyn, + synthetic_data=synthetic_data, + ) + + def _populate_temp_dataset_dir(self, temp_dir: str, dataset: pd.DataFrame) -> Path: + temp_path = Path(temp_dir) + + dataset.to_csv(temp_path / f"{self.table_name}.csv", index=False) + for filename in [f"{self.table_name}.json", f"{self.table_name}_info.json"]: + shutil.copy(self.full_data_dir / filename, temp_path / filename) + + return temp_path + + def _dataset_and_ref_dataset_paths(self, data_dir: Path) -> tuple[Path, Path]: + # The preprocess function below expects 2 folders of preprocessed data: + # 1. the dataset to be processed, which can be a subset of the full dataset + # 2. the full dataset, which is used to unsure it will get all the categories for the categorical features + dataset_path = get_processed_data_dir(data_dir) / self.table_name + full_dataset_path = get_processed_data_dir(self.full_data_dir) / self.table_name + ref_dataset_path = Path(f"{dataset_path}_all") + shutil.rmtree(ref_dataset_path, ignore_errors=True) + shutil.copytree(full_dataset_path, ref_dataset_path) + + return dataset_path, ref_dataset_path + + def _train_or_fine_tune(self, data_dir: Path, tabsyn: TabSyn | None) -> TabSyn: + fine_tune = tabsyn is not None + log(INFO, f"{'Fine Tuning' if fine_tune else 'Training'} new TabSyn model...") + + assert self.training_config.save_dir is not None, "Save dir is not set" + + process_data(self.table_name, data_dir, data_dir) + + log(INFO, "Preprocessing data...") + + dataset_path, ref_dataset_path = self._dataset_and_ref_dataset_paths(data_dir) + + # preprocess the data + # TODO: refactor the return of the preprocess function so we don't need to ignore mypy here + numerical_features, categorical_features, categories, d_numerical = preprocess( # type: ignore[misc] + dataset_path=dataset_path, + ref_dataset_path=ref_dataset_path, + transforms=self.tabsyn_config["transforms"], + task_type=self.tabsyn_config["task_type"], + ) + + # separate train and test data + numerical_features_train = numerical_features[DataSplit.TRAIN.value] + numerical_features_test = numerical_features[DataSplit.TEST.value] + categorical_features_train = categorical_features[DataSplit.TRAIN.value] + categorical_features_test = categorical_features[DataSplit.TEST.value] + + log(INFO, f"Dataset size: {numerical_features_train.shape[0]}") + + # convert to float tensor + numerical_features_train = torch.tensor(numerical_features_train).float() + numerical_features_test = torch.tensor(numerical_features_test).float() + categorical_features_train = torch.tensor(categorical_features_train) + categorical_features_test = torch.tensor(categorical_features_test) + + log(INFO, "Loading the dataset...") + + # create dataset module + train_data = TabularDataset(numerical_features_train.float(), categorical_features_train) + + # move test data to gpu if available + numerical_features_test = numerical_features_test.float().to(DEVICE) + categorical_features_test = categorical_features_test.to(DEVICE) + + # create train dataloader + train_loader: DataLoader[TabularDataset] = DataLoader[TabularDataset]( + # Ignoring here because this is expecting the dataset to be subclass of torch's Dataset but it isn't + train_data, # type: ignore[arg-type] + batch_size=self.tabsyn_config["train"]["vae"]["batch_size"], + shuffle=True, + num_workers=self.tabsyn_config["train"]["vae"]["num_dataset_workers"], + ) + + if fine_tune: + # Load the data into the existing model + log(INFO, "Loading data into existing TabSyn model...") + assert tabsyn is not None, "TabSyn model is not set" + tabsyn.train_loader = train_loader + tabsyn.numerical_features_test = numerical_features_test + tabsyn.categorical_features_test = categorical_features_test + else: + # Instantiate a new model + log(INFO, "Instantiating the TabSyn model...") + tabsyn = TabSyn( + train_loader, + numerical_features_test, + categorical_features_test, + num_numerical_features=d_numerical, + num_classes=categories, + device=DEVICE, + ) + + ###### A. Train the VAE model ###### + + log(INFO, f"{'Fine tuning' if fine_tune else 'Training'} the TabSyn VAE model...") + + self.vae_save_dir.mkdir(parents=True, exist_ok=True) + if not fine_tune: + # instantiate VAE model for training + tabsyn.instantiate_vae( + **self.tabsyn_config["model_params"], + optim_params=self.tabsyn_config["train"]["optim"]["vae"], + ) + + tabsyn.train_vae( + **self.tabsyn_config["loss_params"], + num_epochs=self.tabsyn_config["train"]["vae"]["num_epochs"], + save_path=self.vae_save_dir, + ) + + # embed all inputs in the latent space + tabsyn.save_vae_embeddings( + numerical_features_train, + categorical_features_train, + vae_ckpt_dir=self.vae_save_dir, + ) + tabsyn.save_embeddings_attributes(vae_ckpt_dir=self.vae_save_dir) + + ###### B. Train the Diffusion model ###### + + log(INFO, f"{'Fine tuning' if fine_tune else 'Training'} the TabSyn Diffusion model...") + + # load latent space embeddings + train_z, _ = tabsyn.load_latent_embeddings(self.vae_save_dir) # train_z dim: B x in_dim + + # normalize embeddings + latent_train_data = (train_z - train_z.mean(0)) / 2 + + # create data loader + latent_train_loader: DataLoader[torch.Tensor] = DataLoader[torch.Tensor]( + # Ignoring the type checker here because our code in tabsyn.train_diffusion + # works with plain Tensor and not with TensorDataset + latent_train_data, # type: ignore[arg-type] + batch_size=self.tabsyn_config["train"]["diffusion"]["batch_size"], + shuffle=True, + num_workers=self.tabsyn_config["train"]["diffusion"]["num_dataset_workers"], + ) + + if not fine_tune: + # instantiate diffusion model for training + tabsyn.instantiate_diffusion( + in_dim=train_z.shape[1], + optim_params=self.tabsyn_config["train"]["optim"]["diffusion"], + ) + + # train diffusion model + tabsyn.train_diffusion( + latent_train_loader, + num_epochs=self.tabsyn_config["train"]["diffusion"]["num_epochs"], + ckpt_path=self.training_config.save_dir, + ) + + log(INFO, f"{'Fine tuning' if fine_tune else 'Training'} Done!") + + return tabsyn + + def _synthesize(self, tabsyn: TabSyn) -> pd.DataFrame: + ###### Load the model ###### + + assert self.training_config.save_dir is not None, "Save dir is not set" + + # instantiate VAE model + tabsyn.instantiate_vae(**self.tabsyn_config["model_params"], optim_params=None) + + # load latent embeddings attributes of input data + train_z_att = tabsyn.load_embeddings_attributes(self.vae_save_dir) + token_dim = train_z_att["token_dim"] + in_dim = train_z_att["in_dim"] + + # instantiate diffusion model + tabsyn.instantiate_diffusion(in_dim=in_dim, optim_params=None) + + # load state from checkpoint + tabsyn.load_model_state(ckpt_dir=self.training_config.save_dir, dif_ckpt_name="model.pt") + + ###### Synthesize data ###### + + dataset_path, ref_dataset_path = self._dataset_and_ref_dataset_paths(self.full_data_dir) + + # get inverse tokenizers + # TODO: refactor the return of the preprocess function so we don't need to ignore mypy here + _, _, _, _, num_inverse, cat_inverse = preprocess( # type: ignore[misc] + dataset_path=dataset_path, + ref_dataset_path=ref_dataset_path, + transforms=self.tabsyn_config["transforms"], + task_type=self.tabsyn_config["task_type"], + inverse=True, + ) + + synthetic_data_dir = self.training_config.save_dir / "synthetic_data" + synthetic_data_dir.mkdir(parents=True, exist_ok=True) + + # load data info file + with open(dataset_path / "info.json", "r") as file: + data_info = json.load(file) + + data_info["token_dim"] = token_dim + + # sample data + num_samples = train_z_att["num_samples"] + in_dim = train_z_att["in_dim"] + mean_input_emb = train_z_att["mean_input_emb"] + tabsyn.sample( + num_samples, + in_dim, + mean_input_emb, + info=data_info, + num_inverse=num_inverse, + cat_inverse=cat_inverse, + save_path=synthetic_data_dir / f"{self.table_name}_synthetic.csv", + ) + + log(INFO, "Synthesizing Done!") + + return pd.read_csv(synthetic_data_dir / f"{self.table_name}_synthetic.csv") diff --git a/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py b/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py index b41c7cbf..7ddf4148 100644 --- a/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py +++ b/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py @@ -123,6 +123,9 @@ def compute_gower_for_model( if len(df_synthetic) > min_length: df_synthetic = df_synthetic.sample(n=min_length, random_state=random_seed) + # Drop id columns + id_columns_names = [col for col in df_synthetic.columns if "_id" in col] + df_synthetic = df_synthetic.drop(columns=id_columns_names) if id_column_name in df_synthetic.columns: df_synthetic = df_synthetic.drop(columns=[id_column_name]) diff --git a/src/midst_toolkit/models/tabsyn/dataset.py b/src/midst_toolkit/models/tabsyn/dataset.py index e720d46c..5761cd57 100644 --- a/src/midst_toolkit/models/tabsyn/dataset.py +++ b/src/midst_toolkit/models/tabsyn/dataset.py @@ -52,11 +52,7 @@ def preprocess( task_type: TaskType = TaskType.BINARY_CLASSIFICATION, inverse: bool = False, concat: bool = True, -) -> ( - Dataset - | tuple[ArrayDict, ArrayDict, list[int] | None, int] - | tuple[ArrayDict, ArrayDict, list[int] | None, int, Any, Any] -): +) -> Dataset | tuple[ArrayDict, ArrayDict, list[int], int] | tuple[ArrayDict, ArrayDict, list[int], int, Any, Any]: """Preprocess the dataset. Args: @@ -197,7 +193,7 @@ def make_dataset( return transform_dataset(dataset, transformations, None) -def get_categories(categorical_features_train: np.ndarray | None) -> list[int] | None: +def get_categories(categorical_features_train: np.ndarray | None) -> list[int]: """Get the length of the unique categories from the categorical features. Args: @@ -207,7 +203,7 @@ def get_categories(categorical_features_train: np.ndarray | None) -> list[int] | The length of the unique categories for each feature. """ if categorical_features_train is None: - return None + return [] return [len(set(categorical_features_train[:, i])) for i in range(categorical_features_train.shape[1])] diff --git a/src/midst_toolkit/models/tabsyn/pipeline.py b/src/midst_toolkit/models/tabsyn/pipeline.py index 8977f834..e3ee667e 100644 --- a/src/midst_toolkit/models/tabsyn/pipeline.py +++ b/src/midst_toolkit/models/tabsyn/pipeline.py @@ -60,9 +60,22 @@ def __init__( self.device = device def instantiate_vae( - self, n_head: int, factor: int, num_layers: int, d_token: int, optim_params: dict[str, Any] + self, + n_head: int, + factor: int, + num_layers: int, + d_token: int, + optim_params: dict[str, Any] | None = None, ) -> None: - """Construct VAE model and its optimizer and lr scheduler.""" + """Construct VAE model and its optimizer and lr scheduler. + + Args: + n_head: The number of heads. + factor: The factor for the dimension of the hidden layer. + num_layers: The number of layers. + d_token: The dimension of the token. + optim_params: The optimizer parameters. Optional, defaults to None. + """ # construct vae model self.vae_model, self.pre_encoder, self.pre_decoder = self.__get_vae_model(n_head, factor, num_layers, d_token) # construct vae optimizer and scheduler @@ -70,8 +83,13 @@ def instantiate_vae( self.vae_optimizer, self.vae_scheduler = self.__load_optim(self.vae_model, **optim_params) log(INFO, "Successfully instantiated VAE model.") - def instantiate_diffusion(self, in_dim: int, optim_params: dict[str, Any]) -> None: - """Construct Diffusion model and its optimizer and lr scheduler.""" + def instantiate_diffusion(self, in_dim: int, optim_params: dict[str, Any] | None) -> None: + """Construct Diffusion model and its optimizer and lr scheduler. + + Args: + in_dim: The dimension of the input. + optim_params: The optimizer parameters. Optional, defaults to None. + """ # load diffusion model self.diffusion_model = self.__get_diffusion_model(input_dimension=in_dim) # load optimizer and scheduler diff --git a/src/midst_toolkit/models/tabsyn/preprocessing.py b/src/midst_toolkit/models/tabsyn/preprocessing.py index 7f790ce3..0d78378e 100644 --- a/src/midst_toolkit/models/tabsyn/preprocessing.py +++ b/src/midst_toolkit/models/tabsyn/preprocessing.py @@ -125,7 +125,7 @@ def train_val_test_split( # TODO: Refactor this function to get rid of the "too many statements" and "too many branches" errors from Ruff -def process_data(name: str, info_path: Path, data_dir: Path) -> None: +def process_data(name: str, info_path: Path, data_dir: Path, data_name: str | None = None) -> None: # ruff: noqa: PLR0915, PLR0912 """Process the data for the given dataset name. @@ -135,17 +135,39 @@ def process_data(name: str, info_path: Path, data_dir: Path) -> None: name: The name of the dataset. info_path: The directory where the info file is located. data_dir: The directory where the raw data is located. + data_name: The name of the data file. If not provided, the data file will be named after the dataset name. """ + if data_name is None: + data_name = name + + raw_data_dir = data_dir / "raw_data" processed_data_dir = get_processed_data_dir(data_dir) - with open(info_path / f"{name}.json", "r") as f: + with open(info_path / f"{name}_info.json", "r") as f: info = json.load(f) - data_path = info["data_path"] - if info["file_type"] == "csv": + data_path: Path + test_path: Path | None = None + if (raw_data_dir / "train.csv").exists(): + data_path = raw_data_dir / "train.csv" + test_path = raw_data_dir / "test.csv" + elif (raw_data_dir / "train.xls").exists(): + data_path = raw_data_dir / "train.xls" + test_path = raw_data_dir / "test.xls" + else: + data_path = data_dir / f"{data_name}.csv" + + assert data_path.exists(), ( + f"Train data not found in the expected paths. Expected paths are: {data_dir}/{data_name}.csv, {raw_data_dir}/train.csv, {raw_data_dir}/train.xls." + ) + assert test_path is None or test_path.exists(), ( + f"Test data path not found in the expected paths. Expected paths are: {raw_data_dir}/test.csv, {raw_data_dir}/test.xls." + ) + + if data_path.suffix == ".csv": data_df = pd.read_csv(data_path, header=info["header"]) - elif info["file_type"] == "xls": + elif data_path.suffix == ".xls": data_df = pd.read_excel(data_path, sheet_name="Data", header=1) data_df = data_df.drop("ID", axis=1) @@ -168,9 +190,8 @@ def process_data(name: str, info_path: Path, data_dir: Path) -> None: cat_columns = [column_names[i] for i in cat_col_idx] target_columns = [column_names[i] for i in target_col_idx] - if info["test_path"]: + if test_path: # if testing data is given - test_path = info["test_path"] test_df = pd.read_csv(test_path) train_df = data_df else: diff --git a/tests/integration/assets/tabsyn/trans.json b/tests/integration/assets/tabsyn/trans_info.json similarity index 71% rename from tests/integration/assets/tabsyn/trans.json rename to tests/integration/assets/tabsyn/trans_info.json index 2ffb2563..7bb3da33 100644 --- a/tests/integration/assets/tabsyn/trans.json +++ b/tests/integration/assets/tabsyn/trans_info.json @@ -25,8 +25,5 @@ ], "target_col_idx": [ 3 - ], - "file_type": "csv", - "data_path": "tests/integration/assets/tabsyn/raw_data/train.csv", - "test_path": "tests/integration/assets/tabsyn/raw_data/test.csv" + ] } diff --git a/tests/integration/models/tabsyn/test_train.py b/tests/integration/models/tabsyn/test_train.py index 596cab69..c2ed6c7b 100644 --- a/tests/integration/models/tabsyn/test_train.py +++ b/tests/integration/models/tabsyn/test_train.py @@ -7,6 +7,7 @@ from torch.utils.data import DataLoader from midst_toolkit.common.enumerations import DataSplit +from midst_toolkit.common.variables import DEVICE from midst_toolkit.models.tabsyn.config import load_config from midst_toolkit.models.tabsyn.dataset import TabularDataset, preprocess from midst_toolkit.models.tabsyn.pipeline import TabSyn @@ -67,9 +68,8 @@ def test_train_load_and_synthesize(test_dirs): train_data = TabularDataset(numerical_features_train.float(), categorical_features_train) # move test data to gpu if available - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - numerical_features_test = numerical_features_test.float().to(device) - categorical_features_test = categorical_features_test.to(device) + numerical_features_test = numerical_features_test.float().to(DEVICE) + categorical_features_test = categorical_features_test.to(DEVICE) # create train dataloader train_loader = DataLoader( @@ -86,7 +86,7 @@ def test_train_load_and_synthesize(test_dirs): categorical_features_test, num_numerical_features=d_numerical, num_classes=categories, - device=device, + device=DEVICE, ) model_save_dir = results_dir / test_data_name