diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml
index f9cd2a44..eb30cc28 100644
--- a/.github/workflows/code_checks.yml
+++ b/.github/workflows/code_checks.yml
@@ -29,10 +29,10 @@ jobs:
   run-code-check:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6.0.2
+      - uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39
         with:
           # Install a specific version of uv.
           version: "0.5.21"
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 013f410c..6f9fa896 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -42,10 +42,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
-        uses: actions/checkout@v6.0.2
+        uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v8.1.0
+        uses: astral-sh/setup-uv@v8.2.0
         with:
           version: "0.5.21"
           enable-cache: true
@@ -77,7 +77,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
-        uses: actions/checkout@v6.0.2
+        uses: actions/checkout@v7.0.0
 
       - name: Configure Git Credentials
         run: |
diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml
index e31d69ed..e108d385 100644
--- a/.github/workflows/integration_tests.yml
+++ b/.github/workflows/integration_tests.yml
@@ -41,10 +41,10 @@ jobs:
   integration-tests:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6.0.2
+      - uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39
         with:
           # Install a specific version of uv.
           version: "0.5.21"
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 038d39d6..4f0b855f 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -16,10 +16,10 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install libcurl4-openssl-dev libssl-dev
-      - uses: actions/checkout@v6.0.2
+      - uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39
         with:
           # Install a specific version of uv.
           version: "0.5.21"
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index fb67d417..37c65ed4 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -41,10 +41,10 @@ jobs:
   unit-tests:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6.0.2
+      - uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39
         with:
           # Install a specific version of uv.
           version: "0.5.21"
diff --git a/examples/gan/README.md b/examples/gan/README.md
index 9764e423..8822a748 100644
--- a/examples/gan/README.md
+++ b/examples/gan/README.md
@@ -94,7 +94,7 @@ Kolmogorov-Smirnov (KS) test, Total Variation Distance (TVD), Correlation Matrix
 and Mutual Information Difference.
 
 To compute those metrics, you can run the command below. The name of the table should be
-defined in the `dataset_meta.json` file, and the file for synthetic data should be under
+defined in the `dataset_meta.json` file, and the data files should be under
 `/data/{table_name}.csv` for the real data and `/results/{table_name}_synthetic.csv`
 for the synthetic data.
 
diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md
index 8b3295c2..0a3ad963 100644
--- a/examples/gan/ensemble_attack/README.md
+++ b/examples/gan/ensemble_attack/README.md
@@ -60,10 +60,6 @@ python -m examples.gan.ensemble_attack.make_challenge_dataset
 
 ## 4. Training the attack model
 
-> [!NOTE]
-> In the [`config.yaml`](config.yaml) file, the attribute `ensemble_attack.shadow_training.model_name`
-> is what determines this attack will be run with the CTGAN model.
-
 To train the attack models, execute the following command:
 
 ```bash
@@ -81,7 +77,7 @@ To test the attack model against the target model and synthetic data produced on
 [step 2](#2-generating-target-synthetic-data-to-be-tested), please run:
 
 ```bash
-python -m examples.gan.ensemble_attack.test_attack_model
+python -m examples.gan.ensemble_attack.run_test_attack_model
 ```
 
 ## 6. Compute the attack success
diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml
index 55b083a6..b123f629 100644
--- a/examples/gan/ensemble_attack/config.yaml
+++ b/examples/gan/ensemble_attack/config.yaml
@@ -36,7 +36,6 @@ ensemble_attack:
     run_metaclassifier_training: true
 
   shadow_training:
-    model_name: ctgan
     model_config:  # Configurations specific for the CTGAN model
       training:
         epochs: 300
diff --git a/examples/gan/ensemble_attack/make_challenge_dataset.py b/examples/gan/ensemble_attack/make_challenge_dataset.py
index 1ab30ea9..8eb7c09c 100644
--- a/examples/gan/ensemble_attack/make_challenge_dataset.py
+++ b/examples/gan/ensemble_attack/make_challenge_dataset.py
@@ -39,6 +39,8 @@ def make_challenge_dataset(config: DictConfig) -> None:
     log(INFO, f"Saving challenge labels to {challenge_label_path}")
     np.save(challenge_label_path, challenge_data_labels)
 
+    log(INFO, "Done!")
+
 
 if __name__ == "__main__":
     make_challenge_dataset()
diff --git a/examples/gan/ensemble_attack/test_attack_model.py b/examples/gan/ensemble_attack/run_test_attack_model.py
similarity index 100%
rename from examples/gan/ensemble_attack/test_attack_model.py
rename to examples/gan/ensemble_attack/run_test_attack_model.py
diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py
index e650db3d..83b2b6fc 100644
--- a/examples/gan/ensemble_attack/utils.py
+++ b/examples/gan/ensemble_attack/utils.py
@@ -40,6 +40,14 @@ def make_training_config(config: DictConfig) -> dict[Any, Any]:
     Returns:
         The ensemble attack training config for the CTGAN model.
     """
+    base_data_dir = str
+    if "base_data_dir" in config:
+        base_data_dir = config.base_data_dir
+    elif "data_dir" in config:
+        base_data_dir = config.data_dir
+    else:
+        raise ValueError("Either base_data_dir or data_dir must be provided in the config.")
+
     # Saving the model config from the config.yaml into a json file
     # because that's what the ensemble attack code will be looking for
     training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path)
@@ -48,10 +56,10 @@ def make_training_config(config: DictConfig) -> dict[Any, Any]:
         training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config, resolve=True)
         assert isinstance(training_config, dict), "Training config must be a dictionary."
         training_config["general"] = {
-            "test_data_dir": config.base_data_dir,
+            "test_data_dir": base_data_dir,
             "sample_prefix": "ctgan",
-            "data_dir": config.base_data_dir,
-            "workspace_dir": str(Path(config.base_data_dir) / "shadow_workspace"),
+            "data_dir": base_data_dir,
+            "workspace_dir": str(Path(base_data_dir) / "shadow_workspace"),
             "exp_name": "pre_trained_model",
         }
         json.dump(training_config, f)
diff --git a/examples/tabsyn/README.md b/examples/tabsyn/README.md
new file mode 100644
index 00000000..53c3e8cf
--- /dev/null
+++ b/examples/tabsyn/README.md
@@ -0,0 +1,92 @@
+# TabSyn Single Table Example
+
+This example will go over training a single-table [TabSyn](https://arxiv.org/abs/2310.09656)
+model and synthesizing data afterwards.
+
+
+## Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/1HTgfgeL5GXc8uAGfeQirJrUynK7vFeyb/view?usp=drive_link),
+extract the files and place them in a `/data` folder in within this folder
+(`examples/tabsyn`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+Here is a description of the files that have been extracted:
+- `trans.csv`: The training data. It consists of information about bank transactions and it
+contains 20,000 data points.
+- `trans_info.json`: Metadata about the `trans.csv` data, with information such as which columns are
+numerical and which are categorical, what is the task type, etc.
+
+
+## Kicking off training
+
+To kick off training, simply run the command below from the project's root folder:
+
+```bash
+python -m examples.tabsyn.train
+```
+
+
+## Training results
+
+The result files will be saved inside a `/results` folder within this folder
+(`examples/tabsyn`).
+
+> [!NOTE]
+> If you wish to change the save folder, you can do so by editing the `results_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+In the `/results/trans` folder, there will be a file called `model.pt`,
+which is a pytorch saved model.
+
+
+## Synthesizing data
+
+To synthesize some data with the trained model, run:
+
+```bash
+python -m examples.tabsyn.synthesize
+```
+
+If there is already a trained model in the `/results` folder, it will use that model.
+Otherwise it will train one from scratch. At the end of the script, it will save the
+synthesized data to `/results/trans/synthetic_data/trans_synthetic.csv`.
+
+
+## Evaluating the quality of the synthetic data
+
+### Alpha Precision
+
+To run a round of evaluation with [Alpha Precision](https://arxiv.org/abs/2301.07573)
+metrics on a set of synthetic data, run the `evaluate.py` script:
+
+```bash
+python -m midst_toolkit.evaluation.quality.scripts.midst_alpha_precision_eval \
+  --synthetic_data_path examples/tabsyn/results/trans/synthetic_data/trans_synthetic.csv \
+  --real_data examples/tabsyn/data/trans_sampled.csv \
+  --meta_info_path examples/gan/data/meta_info.json \
+  --save_directory examples/gan/results/
+```
+
+It will save the evaluation results under the `/results/model.txt` file.
+
+### Additional Metrics
+
+The calculation of additional metrics are set up in the `evaluate.py` file. They are the
+Kolmogorov-Smirnov (KS) test, Total Variation Distance (TVD), Correlation Matrix Difference
+and Mutual Information Difference.
+
+To compute those metrics, you can run the command below. The data files should
+be under `/data/{table_name}.csv` for the real data, `/data/{table_name}_samples.csv`
+for the sampleed data used for training, and `/results/{table_name}_synthetic.csv`
+for the synthetic data.
+
+```bash
+python -m examples.tabsyn.evaluate
+```
+
+The results will be saved in the `/results/evaluation.json` file.
diff --git a/examples/tabsyn/config.yaml b/examples/tabsyn/config.yaml
new file mode 100644
index 00000000..d15bd606
--- /dev/null
+++ b/examples/tabsyn/config.yaml
@@ -0,0 +1,85 @@
+data_dir: examples/tabsyn/data
+results_dir: examples/tabsyn/results
+tabsyn_config: examples/tabsyn/tabsyn_config.toml
+table_name: trans
+
+training:
+  sample_size: 20000
+
+evaluation:
+  # Reporting Configuration
+  write_report: True
+  metric_report_path: ${results_dir}/evaluation_results.txt
+
+  # Privacy Evaluation Configurations
+  dcr:
+    run: True
+    norm: "l2"
+    batch_size: 1000
+
+  median_dcr:
+    run: True
+    norm: "l2"
+    batch_size: 1000
+
+  hitting_rate:
+    run: True
+    hitting_threshold: 0.03
+
+  eir:
+    run: True
+    norm: "gower"
+
+  nndr:
+    run: True
+    norm: "l2"
+    batch_size: 1000
+
+  # Quality Evaluation Configurations
+  ks_tv:
+    run: True
+    significance_level: 0.05
+    permutations: 1000
+
+  alpha_precision:
+    run: True
+    naive_only: False
+
+  ci_overlap:
+    run: True
+    confidence_level: 95
+
+  correlation_diff:
+    run: True
+    compute_mixed_correlations: True
+
+  mean_diff:
+    run: True
+
+  f1_score_diff:
+    run: True
+    label_column: "trans_type"
+    folds: 5
+    f1_type: "macro"
+
+  regression_score_diff:
+    run: True
+    label_column: "trans_type"
+    preprocess_labels: True
+    measure_metrics_in_original_label_space: False
+    verbose: True
+    convert_label_to_float: True
+
+  hellinger:
+    run: True
+    include_numerical_columns: True
+
+  propensity_mse:
+    run: True
+    folds: 5
+    max_iterations: 50
+    solver: "liblinear"
+
+  mutual_information:
+    run: True
+    include_numerical_columns: False
diff --git a/examples/tabsyn/ensemble_attack/README.md b/examples/tabsyn/ensemble_attack/README.md
new file mode 100644
index 00000000..00054d2f
--- /dev/null
+++ b/examples/tabsyn/ensemble_attack/README.md
@@ -0,0 +1,95 @@
+# TabSyn Ensemble Attack Example
+
+On this example, we demonstrate how to run the [Ensemble Attack](../../ensemble_attack/README.md)
+using the [TabSyn](https://arxiv.org/abs/2310.09656) model.
+
+## 1. Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/16XCa63eD2dZ1bddhgRbGFuzAuuMlto9P/view?usp=sharing),
+extract the files and place them in a `/data/ensemble_attack` folder within this folder
+(`examples/tabsyn`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `base_data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+Here is a description of the files that have been extracted:
+- `trans.csv`: The full set of training data.
+- `dataset_meta.json`: Metadata about the relationship between the tables in the dataset. Since this is a
+single table dataset, it will only contain information about the transaction (`trans`) table.
+- `trans_domain.json`: Metadata about the columns of the transaction table, such as their size
+and type (`continuous` or `discrete`).
+- `trans_info.json` and `meta_info.json`: Metadata about the `trans.csv` data, with information
+such as which columns are numerical and which are categorical, what is the task type, etc.
+- `data_types.json`: Additional metadata about the columns, splitting them into 4 types:
+    - `numerical`: a list of the columns that contain numerical information
+    - `categorical`: a list of the columns that contain categorical information
+    - `variable_to_predict`: the name of the target column that will be predicted
+    - `id_column_name`: the name of the column in the table that represents the rows' id.
+
+With the data present in the correct folder, we can proceed with running the attack.
+
+## 2. Generating target synthetic data to be tested
+
+The **target model** is the model being attacked, and the **target synthetic data**
+is the synthetic data generated by the target model that will be evaluated against
+the attack.
+
+If you already have a set of synthetic data produced by a target model,
+you can add its path to the `ensemble_attack.target_model.target_synthetic_data_path`
+property in the [`config.yaml`](config.yaml) file and skip this step.
+
+If you wish to train a new target model and produce the synthetic data that will be the
+target of the attack, you can run:
+
+```bash
+python -m examples.tabsyn.synthesize --config-path=./ensemble_attack
+```
+
+## 3. Producing the challenge points dataset
+
+The challenge points dataset is composed of real data points where half of them
+were used in training the target model and half weren't. It is the dataset we are going
+to use to evaluate how good the attack model is in differentiating between
+the points used in training and the ones not used in training.
+
+To produce such dataset, run the following script:
+
+```bash
+python -m examples.tabsyn.ensemble_attack.make_challenge_dataset
+```
+
+## 4. Training the attack model
+
+To train the attack models, execute the following command:
+
+```bash
+python -m examples.tabsyn.ensemble_attack.train_attack_model
+```
+
+This will take a long time to run, so it might be a good idea to execute it as a
+background process. If you want to have a quick test run before kicking off the
+full process, you can change the number of iterations, epochs, population and
+sample sizes to smaller numbers.
+
+## 5. Testing the attack model
+
+To test the attack model against the target model and synthetic data produced on
+[step 2](#2-generating-target-synthetic-data-to-be-tested), please run:
+
+```bash
+python -m examples.tabsyn.ensemble_attack.run_test_attack_model
+```
+
+## 6. Compute the attack success
+
+To compute the metrics about the success of the attack against the target
+synthetic data, you can run the following command:
+
+```bash
+python -m examples.tabsyn.ensemble_attack.compute_attack_success
+```
+
+The results will both printed on the console and saved in the file
+`examples/tabsyn/results/attack_success_for_xgb_metaclassifier_model.txt`
diff --git a/examples/tabsyn/ensemble_attack/compute_attack_success.py b/examples/tabsyn/ensemble_attack/compute_attack_success.py
new file mode 100644
index 00000000..c8c09f99
--- /dev/null
+++ b/examples/tabsyn/ensemble_attack/compute_attack_success.py
@@ -0,0 +1,30 @@
+from logging import INFO
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig
+
+from examples.ensemble_attack.compute_attack_success import compute_attack_success_for_given_targets
+from midst_toolkit.common.logger import log
+
+
+@hydra.main(config_path="./", config_name="config", version_base=None)
+def compute_attack_success(config: DictConfig) -> None:
+    """Main function to compute the attack success."""
+    log(
+        INFO,
+        f"Computing attack success for target synthetic data at {config.ensemble_attack.target_model.target_synthetic_data_path}...",
+    )
+
+    compute_attack_success_for_given_targets(
+        target_model_config=config.ensemble_attack.target_model,
+        # TODO: refactor this to work better outside of the challenge context (i.e. no target ID)
+        # No target ID needed for CTGAN, but it needs at least one element in this array. The value does not matter.
+        target_ids=[0],
+        experiment_directory=Path(config.results_dir),
+        metaclassifier_model_name=config.ensemble_attack.metaclassifier.meta_classifier_model_name,
+    )
+
+
+if __name__ == "__main__":
+    compute_attack_success()
diff --git a/examples/tabsyn/ensemble_attack/config.yaml b/examples/tabsyn/ensemble_attack/config.yaml
new file mode 100644
index 00000000..93613222
--- /dev/null
+++ b/examples/tabsyn/ensemble_attack/config.yaml
@@ -0,0 +1,159 @@
+# Training example configuration
+# Base data directory (can be overridden from command line)
+data_dir: examples/tabsyn/ensemble_attack/data
+results_dir: examples/tabsyn/ensemble_attack/results
+tabsyn_config: examples/tabsyn/tabsyn_config.toml
+table_name: trans
+data_file_name: ${table_name}.csv
+
+training:
+  sample_size: 20000  # number of rows to sample from the real data for training
+
+ensemble_attack:
+  random_seed: null  # Set this to a value if you want to set a random seed for reproducibility
+  table_id_column_name: "trans_id"
+  data_file_name: ${data_file_name}
+
+  data_paths:
+    processed_attack_data_path: ${data_dir}
+    population_path: ${data_dir}  # This is the population data that the attacker has collected or has access to.
+    attack_evaluation_result_path: ${results_dir}/evaluation_results # Path where the attack evaluation results will be stored
+
+  data_processing_config:
+    column_to_stratify: "trans_type"  # Attention: This value is not documented in the original codebase.
+    population_sample_size: 40000 # Population size is the total data that your attack has access to.
+
+  pipeline:
+    #  TODO: properly test these
+    run_data_processing: true # Set this to false if you have already saved the processed data
+    run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
+    run_metaclassifier_training: true
+
+  shadow_training:
+    model_name: ctgan
+    model_config:  # Configurations specific for the CTGAN model
+      training:
+        epochs: 300
+        verbose: True
+      synthesizing:
+        sample_size: 20000
+    shadow_models_output_path: ${results_dir}/ensemble_attack/shadow_models
+    target_model_output_path: ${results_dir}/shadow_target_model_and_data
+    training_json_config_paths: # Config json files used for tabddpm training on the trans table
+      table_domain_file_path: ${data_dir}/trans_domain.json
+      dataset_meta_file_path: ${data_dir}/dataset_meta.json
+      training_config_path: ${data_dir}/trans.json  # if this is not present, it will be created by copying the example config
+    fine_tuning_config:
+      fine_tune_diffusion_iterations: 200000
+      fine_tune_classifier_iterations: 20000
+      pre_train_data_size: 60000
+    number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models.
+
+    final_shadow_models_path: [
+      "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl",
+      "${ensemble_attack.shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl",
+      "${ensemble_attack.shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl",
+    ]
+    target_synthetic_data_path: ${ensemble_attack.shadow_training.target_model_output_path}/target_synthetic_data.csv
+
+  # Metaclassifier settings
+  metaclassifier:
+    # Data types json file is used for xgboost model training.
+    data_types_file_path: ${data_dir}/data_types.json
+    model_type: "xgb"
+    # Model training parameters
+    num_optuna_trials: 100 # Original code: 100
+    num_kfolds: 5
+    use_gpu: false
+    # Temporary. Might remove having an epoch parameter.
+    epochs: 1
+    meta_classifier_model_name: ${ensemble_attack.metaclassifier.model_type}_metaclassifier_model
+    metaclassifier_model_path: ${results_dir}/trained_models # Path where the trained metaclassifier model will be saved
+
+  target_model: # This is only used for testing the attack on a real target model.
+    target_synthetic_data_path: ${results_dir}/${table_name}/synthetic_data/${table_name}_synthetic.csv
+    challenge_data_path: ${ensemble_attack.data_paths.processed_attack_data_path}/${table_name}_challenge_data.csv
+    challenge_label_path: ${ensemble_attack.data_paths.processed_attack_data_path}/${table_name}_challenge_labels.npy
+
+    target_shadow_models_output_path: ${results_dir}/test_all_targets # Sub-directory to store test shadows and results
+    attack_probabilities_result_path: ${results_dir}/test_probabilities
+    attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training RMIA attack model in testing phase.
+    # See select_challenge_data_for_training()'s docstring for more details.
+
+evaluation:
+  # Reporting Configuration
+  write_report: True
+  metric_report_path: ${results_dir}/evaluation_results.txt
+
+  # Privacy Evaluation Configurations
+  dcr:
+    run: True
+    norm: "l2"
+    batch_size: 1000
+
+  median_dcr:
+    run: True
+    norm: "l2"
+    batch_size: 1000
+
+  hitting_rate:
+    run: True
+    hitting_threshold: 0.03
+
+  eir:
+    run: True
+    norm: "gower"
+
+  nndr:
+    run: True
+    norm: "l2"
+    batch_size: 1000
+
+  # Quality Evaluation Configurations
+  ks_tv:
+    run: True
+    significance_level: 0.05
+    permutations: 1000
+
+  alpha_precision:
+    run: True
+    naive_only: False
+
+  ci_overlap:
+    run: True
+    confidence_level: 95
+
+  correlation_diff:
+    run: True
+    compute_mixed_correlations: True
+
+  mean_diff:
+    run: True
+
+  f1_score_diff:
+    run: True
+    label_column: "trans_type"
+    folds: 5
+    f1_type: "macro"
+
+  regression_score_diff:
+    run: True
+    label_column: "trans_type"
+    preprocess_labels: True
+    measure_metrics_in_original_label_space: False
+    verbose: True
+    convert_label_to_float: True
+
+  hellinger:
+    run: True
+    include_numerical_columns: True
+
+  propensity_mse:
+    run: True
+    folds: 5
+    max_iterations: 50
+    solver: "liblinear"
+
+  mutual_information:
+    run: True
+    include_numerical_columns: False
diff --git a/examples/tabsyn/ensemble_attack/make_challenge_dataset.py b/examples/tabsyn/ensemble_attack/make_challenge_dataset.py
new file mode 100644
index 00000000..987ba704
--- /dev/null
+++ b/examples/tabsyn/ensemble_attack/make_challenge_dataset.py
@@ -0,0 +1,40 @@
+from logging import INFO
+from pathlib import Path
+
+import hydra
+import numpy as np
+import pandas as pd
+from omegaconf import DictConfig
+
+from midst_toolkit.common.logger import log
+
+
+@hydra.main(config_path="./", config_name="config", version_base=None)
+def make_challenge_dataset(config: DictConfig) -> None:
+    """Main function to make the challenge dataset."""
+    log(INFO, "Making challenge dataset...")
+
+    real_data = pd.read_csv(Path(config.data_dir) / f"{config.table_name}.csv")
+
+    training_data = pd.read_csv(Path(config.results_dir) / f"{config.table_name}_sampled.csv")
+    id_column = f"{config.table_name}_id"
+    untrained_data = real_data[~real_data[id_column].isin(training_data[id_column])].sample(len(training_data))
+
+    challenge_data = pd.concat([training_data, untrained_data])
+    challenge_data_labels = np.concatenate([np.ones(len(training_data)), np.zeros(len(untrained_data))])
+
+    processed_attack_data_path = Path(config.ensemble_attack.data_paths.processed_attack_data_path)
+    processed_attack_data_path.mkdir(parents=True, exist_ok=True)
+
+    challenge_data_path = processed_attack_data_path / f"{config.table_name}_challenge_data.csv"
+    challenge_label_path = processed_attack_data_path / f"{config.table_name}_challenge_labels.npy"
+    log(INFO, f"Saving challenge data to {challenge_data_path}")
+    challenge_data.to_csv(challenge_data_path, index=False)
+    log(INFO, f"Saving challenge labels to {challenge_label_path}")
+    np.save(challenge_label_path, challenge_data_labels)
+
+    log(INFO, "Done!")
+
+
+if __name__ == "__main__":
+    make_challenge_dataset()
diff --git a/examples/tabsyn/ensemble_attack/run_test_attack_model.py b/examples/tabsyn/ensemble_attack/run_test_attack_model.py
new file mode 100644
index 00000000..50b52fb1
--- /dev/null
+++ b/examples/tabsyn/ensemble_attack/run_test_attack_model.py
@@ -0,0 +1,32 @@
+from logging import INFO
+
+import hydra
+from omegaconf import DictConfig
+
+from examples.ensemble_attack.test_attack_model import run_metaclassifier_testing
+from examples.gan.ensemble_attack.utils import make_training_config
+from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabSynModelRunner
+from midst_toolkit.common.logger import log
+
+
+@hydra.main(config_path="./", config_name="config", version_base=None)
+def attack_model_test(config: DictConfig) -> None:
+    """
+    Main function to test the attack model.
+
+    Args:
+        config: The configuration object from the config.yaml file.
+    """
+    log(
+        INFO,
+        f"Testing attack model against synthetic data at {config.ensemble_attack.target_model.target_synthetic_data_path}...",
+    )
+
+    make_training_config(config)
+    model_runner = EnsembleAttackTabSynModelRunner(config)
+
+    run_metaclassifier_testing(model_runner, config.ensemble_attack)
+
+
+if __name__ == "__main__":
+    attack_model_test()
diff --git a/examples/tabsyn/ensemble_attack/train_attack_model.py b/examples/tabsyn/ensemble_attack/train_attack_model.py
new file mode 100644
index 00000000..8914858d
--- /dev/null
+++ b/examples/tabsyn/ensemble_attack/train_attack_model.py
@@ -0,0 +1,88 @@
+from logging import INFO
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig
+
+from examples.ensemble_attack.run_metaclassifier_training import run_metaclassifier_training
+from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training, run_target_model_training
+from examples.gan.ensemble_attack.utils import get_master_challenge_train_data
+from midst_toolkit.attacks.ensemble.data_utils import load_dataframe, save_dataframe
+from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabSynModelRunner
+from midst_toolkit.attacks.ensemble.process_split_data import process_split_data
+from midst_toolkit.common.logger import log
+from midst_toolkit.common.random import set_all_random_seeds
+
+
+@hydra.main(config_path="./", config_name="config", version_base=None)
+def train_attack_model(config: DictConfig) -> None:
+    """
+    Train the Ensemble Attack pipeline with CTGAN model.
+
+    As the first step, data processing is done.
+    Second step is shadow model training used for RMIA attack.
+    Third step is metaclassifier training and evaluation.
+
+    Args:
+        config: Attack configuration as an OmegaConf DictConfig object.
+    """
+    if config.ensemble_attack.random_seed is not None:
+        set_all_random_seeds(seed=config.ensemble_attack.random_seed)
+        log(INFO, f"Training phase random seed set to {config.ensemble_attack.random_seed}.")
+
+    if config.ensemble_attack.pipeline.run_data_processing:
+        log(INFO, "Running data processing pipeline...")
+        # The following function saves the required dataframe splits in the specified processed_attack_data_path path.
+        population_data = load_dataframe(
+            Path(config.ensemble_attack.data_paths.population_path),
+            config.data_file_name,
+        )
+
+        # Removing id columns and saving the dataset
+        id_columns = [c for c in population_data.columns if c.endswith("_id")]
+        population_data_no_id = population_data.drop(columns=id_columns)
+        save_dataframe(
+            population_data_no_id,
+            Path(config.ensemble_attack.data_paths.population_path),
+            f"{Path(config.data_file_name).stem}_no_id.csv",
+        )
+
+        process_split_data(
+            all_population_data=population_data,
+            processed_attack_data_path=Path(config.ensemble_attack.data_paths.processed_attack_data_path),
+            # TODO: column_to_stratify value is not documented in the original codebase.
+            column_to_stratify=config.ensemble_attack.data_processing_config.column_to_stratify,
+            num_total_samples=config.ensemble_attack.data_processing_config.population_sample_size,
+            random_seed=config.ensemble_attack.random_seed,
+        )
+
+    if config.ensemble_attack.pipeline.run_shadow_model_training:
+        log(INFO, "Training the shadow models...")
+
+        model_runner = EnsembleAttackTabSynModelRunner(config)
+
+        master_challenge_train = get_master_challenge_train_data(config)
+        shadow_data_paths = run_shadow_model_training(model_runner, config.ensemble_attack, master_challenge_train)
+
+        log(INFO, "Training the target model...")
+        target_model_synthetic_path = run_target_model_training(model_runner, config.ensemble_attack)
+
+    if config.ensemble_attack.pipeline.run_metaclassifier_training:
+        log(INFO, "Training the metaclassifier...")
+        if not config.ensemble_attack.pipeline.run_shadow_model_training:
+            # If shadow model training is skipped, we need to provide the previous shadow model and target model paths.
+            shadow_data_paths = [
+                Path(path) for path in config.ensemble_attack.shadow_training.final_shadow_models_path
+            ]
+            target_model_synthetic_path = Path(config.ensemble_attack.shadow_training.target_synthetic_data_path)
+
+        assert len(shadow_data_paths) == 3, "The attack_data_paths list must contain exactly three elements."
+        assert target_model_synthetic_path is not None, (
+            "The target_data_path must be provided for metaclassifier training."
+        )
+
+        run_metaclassifier_training(config.ensemble_attack, shadow_data_paths, target_model_synthetic_path)
+
+
+if __name__ == "__main__":
+    train_attack_model()
diff --git a/examples/tabsyn/evaluate.py b/examples/tabsyn/evaluate.py
new file mode 100644
index 00000000..5853bf74
--- /dev/null
+++ b/examples/tabsyn/evaluate.py
@@ -0,0 +1,449 @@
+from logging import INFO
+from pathlib import Path
+from typing import Any
+
+import hydra
+import pandas as pd
+from omegaconf import DictConfig
+
+from examples.midst_evaluation.preprocessing import (
+    get_numerical_and_categorical_column_names,
+    preprocess_data_for_alpha_precision_eval,
+    syntheval_preprocess,
+)
+from midst_toolkit.common.logger import log
+from midst_toolkit.data_processing.midst_data_processing import load_midst_data_with_test
+from midst_toolkit.evaluation.metrics_base import MetricBase
+from midst_toolkit.evaluation.privacy import (
+    DistanceToClosestRecordScore,
+    EpsilonIdentifiabilityRisk,
+    HittingRate,
+    MedianDistanceToClosestRecordScore,
+    NearestNeighborDistanceRatio,
+)
+from midst_toolkit.evaluation.privacy.distance_preprocess import preprocess_for_distance_computation
+from midst_toolkit.evaluation.privacy.distance_utils import NormType
+from midst_toolkit.evaluation.privacy.epsilon_identifiability_risk import EpsilonIdentifiabilityNorm
+from midst_toolkit.evaluation.quality import (
+    AlphaPrecision,
+    CorrelationMatrixDifference,
+    DimensionwiseMeanDifference,
+    KolmogorovSmirnovAndTotalVariation,
+    MeanConfidenceIntervalOverlap,
+    MeanF1ScoreDifference,
+    MeanHellingerDistance,
+    MeanPropensityMeanSquaredError,
+    MeanRegressionDifference,
+    MutualInformationDifference,
+)
+from midst_toolkit.evaluation.quality.confidence_interval_overlap import ConfidenceLevel
+
+
+SEPARATOR = "-" * 80
+
+
+def log_metrics(header: str, results: dict[str, float]) -> None:
+    """
+    Helper function to log metrics associated with the results dictionary in a structured fashion. The header
+    is used to separate out different families of metrics in the output.
+
+    Args:
+        header: String to describe the set of metrics that will be logged.
+        results: Dictionary of metric names (keys) and metric values (values) to be logged.
+    """
+    log(INFO, f"\n{header}\n{SEPARATOR}\n")
+    for metric_name, metric_value in results.items():
+        log(INFO, rf"Metric: {metric_name}\Score: {metric_value}")
+    log(INFO, f"{SEPARATOR}\n")
+
+
+def write_metrics(metric_report_path: Path, header: str, results: dict[str, float]) -> None:
+    """
+    Helper function to write metrics associated with the results dictionary in a structured fashion to a file. The
+    header is used to separate out different families of metrics in the output.
+
+    Args:
+        metric_report_path: Path to with the metrics will be written
+        header: String to describe the set of metrics that will be written.
+        results: Dictionary of metric names (keys) and metric values (values) to be written.
+    """
+    with open(metric_report_path, "a") as f:
+        f.write(f"\n{header}\n{SEPARATOR}\n")
+        for metric_name, metric_value in results.items():
+            f.write(f"Metric: {metric_name:40}Metric: {metric_value}\n")
+        f.write(f"{SEPARATOR}\n")
+
+
+def report_metrics(cfg: DictConfig, header: str, results: dict[str, float]) -> None:
+    """
+    A helper function facilitate both logging and, optionally depending on the configuration settings, write the
+    metrics results to a file.
+
+    Args:
+        cfg: Configuration determining if and where to write metrics to a file.
+        header: String to describe the set of metrics that will be logged/written.
+        results: Dictionary of metric names (keys) and metric values (values) to be logged/written.
+    """
+    log_metrics(header, results)
+    write_metrics(Path(cfg.metric_report_path), header, results)
+
+
+def remove_label_column_from_other_columns(
+    label_column: str, numerical_columns: list[str], categorical_columns: list[str]
+) -> tuple[list[str], list[str]]:
+    """
+    Given a column name for a target label (task label), this ensures that the label is removed from either the
+    numerical or categorical columns list. During preprocessing, it is advantageous to also process the label
+    column into, for instance, and ordinal value. However, when performing F1 measurements, we no longer want it
+    to be part of the dataframe during training.
+
+    Args:
+        label_column: Column name associated with task labels of interest.
+        numerical_columns: Set of column names associated with numerical values.
+        categorical_columns: Set of column names associated with categorical values.
+
+    Raises:
+        ValueError: Will throw an error if the label column is present in both column names lists, which is bad...
+
+    Returns:
+        Filtered copies of the numerical and categorical column names without the specified label column included.
+    """
+    if label_column in numerical_columns and label_column in categorical_columns:
+        raise ValueError("Label column appears in both types of columns...")
+    return [item for item in numerical_columns if label_column != item], [
+        item for item in categorical_columns if label_column != item
+    ]
+
+
+def should_syntheval_preprocess(cfg: DictConfig, for_privacy: bool) -> bool:
+    """
+    Determines whether, based on the configuration, syntheval preprocessing needs to be performed.
+
+    Args:
+        cfg: Configuration with the settings for the entire evaluation pipeline.
+        for_privacy: Whether this is for the privacy evaluations (True) or for the quality evaluations (False).
+
+    Returns:
+        True if preprocessing with the SynthEval pipeline should be performed, False otherwise
+    """
+    if for_privacy:
+        return any([cfg.hitting_rate.run, cfg.eir.run])
+    return any(
+        [
+            cfg.ks_tv.run,
+            cfg.ci_overlap.run,
+            cfg.ks_tv.run,
+            cfg.correlation_diff.run,
+            cfg.mean_diff.run,
+            cfg.f1_score_diff.run,
+            cfg.regression_score_diff.run,
+            cfg.hellinger.run,
+            cfg.propensity_mse.run,
+            cfg.mutual_information.run,
+        ]
+    )
+
+
+def run_quality_evaluations(
+    # ruff: noqa: PLR0915
+    cfg: DictConfig,
+    real_data_train: pd.DataFrame,
+    synthetic_data: pd.DataFrame,
+    real_data_test: pd.DataFrame,
+    meta_info: dict[str, Any],
+) -> None:
+    """
+    Run quality evaluation metrics.
+
+    Args:
+        cfg: Configuration information for the evaluation.
+        real_data_train: Dataframe containing real data to which the synthetic data will be compared (typically used to
+            train the model that generation the synthetic data).
+        synthetic_data: Dataframe containing the synthetic data whose quality is to be measured.
+        real_data_test: Dataframe containing real data to which the synthetic data will be compared (typically
+            explicitly NOT USED to train the model that generation the synthetic data).
+        meta_info: Dictionary containing information about the dataframes, including which columns correspond to
+            numerical and categorical values or whether there is a column corresponding to a label.
+    """
+    metric: MetricBase
+    if cfg.alpha_precision.run:
+        log(INFO, "Preprocessing Data for Alpha Precision Evaluation")
+        # Categorical values are one-hot encoded, numerical values are left alone.
+        alpha_precision_real_data, alpha_precision_synthetic_data = preprocess_data_for_alpha_precision_eval(
+            real_data=real_data_train, synthetic_data=synthetic_data, meta_info=meta_info
+        )
+        log(INFO, "Running Alpha-Precision Evaluation")
+        metric = AlphaPrecision(naive_only=cfg.alpha_precision.naive_only)
+        results = metric.compute(alpha_precision_real_data, alpha_precision_synthetic_data)
+        report_metrics(cfg, "ALPHA PRECISION", results)
+
+    # Shared preprocessing for syntheval based metrics if they are to be run
+    if should_syntheval_preprocess(cfg, for_privacy=False):
+        log(INFO, "Preprocessing Data with SynthEval pipeline")
+        numerical_columns, categorical_columns = get_numerical_and_categorical_column_names(real_data_train, meta_info)
+
+        # Categorical values are ordinal encoded, numerical values are min-max scaled
+        syntheval_real_data_train, syntheval_synthetic_data, syntheval_real_data_test = syntheval_preprocess(
+            numerical_columns, categorical_columns, real_data_train, synthetic_data, real_data_test
+        )
+
+    if cfg.ks_tv.run:
+        log(INFO, "Running Kolmogorov-Smirnov and Total Variation Evaluation")
+        metric = KolmogorovSmirnovAndTotalVariation(
+            categorical_columns=categorical_columns,
+            numerical_columns=numerical_columns,
+            significance_level=cfg.ks_tv.significance_level,
+            permutations=cfg.ks_tv.permutations,
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data)
+        report_metrics(cfg, "KOLMOGOROV SMIRNOV AND TOTAL VARIATION", results)
+
+    if cfg.ci_overlap.run:
+        log(INFO, "Running Confidence Interval Overlap Evaluation")
+        metric = MeanConfidenceIntervalOverlap(
+            categorical_columns=categorical_columns,
+            numerical_columns=numerical_columns,
+            confidence_level=ConfidenceLevel(cfg.ci_overlap.confidence_level),
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data)
+        report_metrics(cfg, "CONFIDENCE INTERVAL OVERLAP", results)
+
+    if cfg.correlation_diff.run:
+        log(INFO, "Running Mean Correlation Matrix Difference Evaluation")
+        metric = CorrelationMatrixDifference(
+            categorical_columns=categorical_columns,
+            numerical_columns=numerical_columns,
+            compute_mixed_correlations=cfg.correlation_diff.compute_mixed_correlations,
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data)
+        report_metrics(cfg, "CORRELATION MATRIX DIFFERENCE", results)
+
+    if cfg.mean_diff.run:
+        log(INFO, "Running Dimensionwise Mean Difference Evaluation")
+        metric = DimensionwiseMeanDifference(
+            categorical_columns=categorical_columns,
+            numerical_columns=numerical_columns,
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data)
+        report_metrics(cfg, "DIMENSIONWISE MEAN DIFFERENCE", results)
+
+    if cfg.f1_score_diff.run:
+        # Explicitly removing the target/label column from other column names
+        label_column = cfg.f1_score_diff.label_column
+        filtered_numerical_columns, filtered_categorical_columns = remove_label_column_from_other_columns(
+            label_column, numerical_columns, categorical_columns
+        )
+        log(INFO, "Running F1 Score Difference Evaluation")
+        metric = MeanF1ScoreDifference(
+            categorical_columns=filtered_categorical_columns,
+            numerical_columns=filtered_numerical_columns,
+            label_column=label_column,
+            folds=cfg.f1_score_diff.folds,
+            f1_type=cfg.f1_score_diff.f1_type,
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data, syntheval_real_data_test)
+        report_metrics(cfg, "F1 SCORE DIFFERENCE", results)
+
+    if cfg.regression_score_diff.run:
+        # Explicitly removing the target/label column from other column names
+        label_column = cfg.regression_score_diff.label_column
+        filtered_numerical_columns, filtered_categorical_columns = remove_label_column_from_other_columns(
+            label_column, numerical_columns, categorical_columns
+        )
+        log(INFO, "Running Regression Score Difference Evaluation")
+        metric = MeanRegressionDifference(
+            categorical_columns=filtered_categorical_columns,
+            numerical_columns=filtered_numerical_columns,
+            label_column=label_column,
+            preprocess_labels=cfg.regression_score_diff.preprocess_labels,
+            include_additional_metrics=cfg.regression_score_diff.verbose,
+            # Regression has it's own preprocessing pipeline
+            do_preprocess=True,
+            measure_metrics_in_original_label_space=cfg.regression_score_diff.measure_metrics_in_original_label_space,
+        )
+
+        real_data_train_copy = real_data_train.copy()
+        synthetic_data_copy = synthetic_data.copy()
+        real_data_test_copy = real_data_test.copy()
+        if cfg.regression_score_diff.convert_label_to_float:
+            real_data_train_copy[label_column] = real_data_train_copy[label_column].astype(float)
+            synthetic_data_copy[label_column] = synthetic_data_copy[label_column].astype(float)
+            real_data_test_copy[label_column] = real_data_test_copy[label_column].astype(float)
+
+        results = metric.compute(real_data_train_copy, synthetic_data_copy, real_data_test_copy)
+        report_metrics(cfg, "REGRESSION SCORE DIFFERENCE", results)
+
+    if cfg.hellinger.run:
+        log(INFO, "Running Hellinger Distance Difference Evaluation")
+        metric = MeanHellingerDistance(
+            categorical_columns=categorical_columns,
+            numerical_columns=numerical_columns,
+            include_numerical_columns=cfg.hellinger.include_numerical_columns,
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data)
+        report_metrics(cfg, "HELLINGER DISTANCE DIFFERENCE", results)
+
+    if cfg.propensity_mse.run:
+        log(INFO, "Running Propensity Mean Squared Error Evaluation")
+        metric = MeanPropensityMeanSquaredError(
+            categorical_columns=categorical_columns,
+            numerical_columns=numerical_columns,
+            folds=cfg.propensity_mse.folds,
+            max_iterations=cfg.propensity_mse.max_iterations,
+            solver=cfg.propensity_mse.solver,
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data)
+        report_metrics(cfg, "PROPENSITY MEAN SQUARED ERROR", results)
+
+    if cfg.mutual_information.run:
+        log(INFO, "Running Mutual Information Difference Evaluation")
+        metric = MutualInformationDifference(
+            categorical_columns=categorical_columns,
+            numerical_columns=numerical_columns,
+            include_numerical_columns=cfg.mutual_information.include_numerical_columns,
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data)
+        report_metrics(cfg, "MUTUAL INFORMATION DIFFERENCE", results)
+
+
+def run_privacy_evaluations(
+    cfg: DictConfig,
+    real_data_train: pd.DataFrame,
+    synthetic_data: pd.DataFrame,
+    real_data_test: pd.DataFrame,
+    meta_info: dict[str, Any],
+) -> None:
+    """
+    Run Privacy evaluation metrics.
+
+    Args:
+        cfg: Configuration information for the evaluation.
+        real_data_train: Dataframe containing real data to which the synthetic data will be compared (typically used to
+            train the model that generation the synthetic data).
+        synthetic_data: Dataframe containing the synthetic data whose quality is to be measured.
+        real_data_test: Dataframe containing real data to which the synthetic data will be compared (typically
+            explicitly NOT USED to train the model that generation the synthetic data).
+        meta_info: Dictionary containing information about the dataframes, including which columns correspond to
+            numerical and categorical values or whether there is a column corresponding to a label.
+    """
+    metric: MetricBase
+    # Shared preprocessing for syntheval based metrics if they are to be run
+    if should_syntheval_preprocess(cfg, for_privacy=True):
+        log(INFO, "Preprocessing Data with SynthEval pipeline")
+        numerical_columns, categorical_columns = get_numerical_and_categorical_column_names(real_data_train, meta_info)
+
+        # Categorical values are ordinal encoded, numerical values are min-max scaled
+        syntheval_real_data_train, syntheval_synthetic_data = syntheval_preprocess(
+            numerical_columns, categorical_columns, real_data_train, synthetic_data
+        )
+
+    if cfg.hitting_rate.run:
+        log(INFO, "Running Hitting Rate Evaluation")
+
+        metric = HittingRate(
+            categorical_columns=categorical_columns,
+            numerical_columns=numerical_columns,
+            hitting_threshold=cfg.hitting_rate.hitting_threshold,
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data)
+        report_metrics(cfg, "HITTING RATE", results)
+
+    if cfg.eir.run:
+        log(INFO, "Running Epsilon Identifiability Rate Evaluation")
+        # Categorical values are ordinal encoded, numerical values are min-max scaled
+        metric = EpsilonIdentifiabilityRisk(
+            categorical_columns=categorical_columns,
+            numerical_columns=numerical_columns,
+            norm=EpsilonIdentifiabilityNorm(cfg.eir.norm),
+            # Already preprocessing above
+            do_preprocess=False,
+        )
+        results = metric.compute(syntheval_real_data_train, syntheval_synthetic_data)
+        report_metrics(cfg, "EPSILON IDENTIFIABILITY RISK", results)
+
+    # Shared preprocessing for distance based metrics, if they are to be run.
+    if any([cfg.dcr.run, cfg.median_dcr.run, cfg.nndr.run]):
+        log(INFO, "Preprocessing Data for Distance Evaluation")
+        # Categorical values are one-hot encoded, numerical values are scaled by their range, but not into [0,1]
+        distance_real_data, distance_synthetic_data, distance_holdout_data = preprocess_for_distance_computation(
+            meta_info=meta_info,
+            real_data_train=real_data_train,
+            synthetic_data=synthetic_data,
+            real_data_test=real_data_test,
+        )
+
+    if cfg.dcr.run:
+        log(INFO, "Running DCR Evaluation")
+        metric = DistanceToClosestRecordScore(NormType(cfg.dcr.norm), cfg.dcr.batch_size, do_preprocess=False)
+        results = metric.compute(distance_real_data, distance_synthetic_data, distance_holdout_data)
+        report_metrics(cfg, "DISTANCE TO CLOSEST RECORD", results)
+
+    if cfg.median_dcr.run:
+        log(INFO, "Running Median DCR Evaluation")
+        metric = MedianDistanceToClosestRecordScore(
+            NormType(cfg.median_dcr.norm), cfg.median_dcr.batch_size, do_preprocess=False
+        )
+        results = metric.compute(distance_real_data, distance_synthetic_data)
+        report_metrics(cfg, "MEDIAN DISTANCE TO CLOSEST RECORD", results)
+
+    if cfg.nndr.run:
+        log(INFO, "Running NNDR Evaluation")
+        metric = NearestNeighborDistanceRatio(NormType(cfg.nndr.norm), cfg.nndr.batch_size, do_preprocess=False)
+        results = metric.compute(distance_real_data, distance_synthetic_data, distance_holdout_data)
+        report_metrics(cfg, "NEAREST NEIGHBOR DISTANCE RATIO", results)
+
+
+@hydra.main(config_path=".", config_name="config", version_base=None)
+def main(cfg: DictConfig) -> None:
+    """Entry point for the evaluation script."""
+    log(INFO, "Loading Data for Evaluations")
+
+    full_data_path = Path(cfg.data_dir) / f"{cfg.table_name}.csv"
+    train_data_path = Path(cfg.data_dir) / f"{cfg.table_name}_sampled.csv"
+    synthetic_data_path = Path(cfg.results_dir) / cfg.table_name / "synthetic_data" / f"{cfg.table_name}_synthetic.csv"
+    meta_data_path = Path(cfg.data_dir) / f"{cfg.table_name}_info.json"
+
+    log(INFO, "Making the test dataset...")
+
+    full_dataset = pd.read_csv(full_data_path)
+    sampled_dataset = pd.read_csv(train_data_path)
+    merged_dataset = full_dataset.merge(sampled_dataset, how="left", indicator=True)
+    complement_dataset = merged_dataset.loc[merged_dataset["_merge"] == "left_only"].drop(columns="_merge")
+    test_dataset = complement_dataset.sample(n=len(sampled_dataset))
+
+    test_data_path = Path(cfg.data_dir) / f"{cfg.table_name}_test.csv"
+    test_dataset.to_csv(test_data_path, index=False)
+
+    log(INFO, "Loading the data...")
+
+    real_data_train, synthetic_data, real_data_test, meta_info = load_midst_data_with_test(
+        train_data_path,
+        synthetic_data_path,
+        meta_data_path,
+        test_data_path,
+    )
+
+    run_quality_evaluations(cfg.evaluation, real_data_train, synthetic_data, real_data_test, meta_info)
+    run_privacy_evaluations(cfg.evaluation, real_data_train, synthetic_data, real_data_test, meta_info)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/tabsyn/synthesize.py b/examples/tabsyn/synthesize.py
new file mode 100644
index 00000000..49a8667c
--- /dev/null
+++ b/examples/tabsyn/synthesize.py
@@ -0,0 +1,115 @@
+import json
+from logging import INFO
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig
+
+from examples.tabsyn.train import train_tabsyn
+from midst_toolkit.common.logger import log
+from midst_toolkit.common.variables import DEVICE
+from midst_toolkit.models.tabsyn.config import load_config
+from midst_toolkit.models.tabsyn.dataset import preprocess
+from midst_toolkit.models.tabsyn.pipeline import TabSyn
+from midst_toolkit.models.tabsyn.preprocessing import get_processed_data_dir
+
+
+@hydra.main(config_path=".", config_name="config", version_base=None)
+def tabsyn_synthesize(config: DictConfig) -> None:
+    """
+    Synthesize data using the TabSyn model.
+
+    Args:
+        config: Configuration as an OmegaConf DictConfig object.
+    """
+    log(INFO, "Synthesizing data using the TabSyn model...")
+
+    results_dir = Path(config.results_dir)
+    model_save_dir = results_dir / config.table_name
+    vae_save_dir = model_save_dir / "vae"
+
+    if not (model_save_dir / "model.pt").exists() or not (vae_save_dir / "model.pt").exists():
+        log(INFO, "Trained model not found. Training the model...")
+        train_tabsyn(config)
+
+    else:
+        log(INFO, "Trained model found.")
+
+    # The preprocess function below expects 2 folders of preprocessed data:
+    # 1. the dataset to be processed, which can be a subset of the full dataset
+    # 2. the full dataset, which is used to unsure it will get all the categories for the categorical features
+    # Here we are making the dataset #2 (full dataset) by copying the dataset #1
+    dataset_path = get_processed_data_dir(Path(config.data_dir)) / config.table_name
+    ref_dataset_path = Path(f"{dataset_path}_all")
+
+    tabsyn_config = load_config(Path(config.tabsyn_config))
+
+    log(INFO, "Instantiating the TabSyn model...")
+
+    _, _, categories, d_numerical = preprocess(  # type: ignore[misc]
+        dataset_path=dataset_path,
+        ref_dataset_path=ref_dataset_path,
+        transforms=tabsyn_config["transforms"],
+        task_type=tabsyn_config["task_type"],
+    )
+
+    # Instantiate an empty model object so we can load the model state from the checkpoint
+    # TODO: Refactor this constructor to allow for this use case without having to ignore the type checker
+    tabsyn = TabSyn(None, None, None, d_numerical, categories, device=DEVICE)  # type: ignore[arg-type]
+
+    ###### Load the model ######
+
+    # instantiate VAE model
+    tabsyn.instantiate_vae(**tabsyn_config["model_params"], optim_params=None)
+
+    # load latent embeddings attributes of input data
+    train_z_att = tabsyn.load_embeddings_attributes(vae_save_dir)
+    token_dim = train_z_att["token_dim"]
+    in_dim = train_z_att["in_dim"]
+
+    # instantiate diffusion model
+    tabsyn.instantiate_diffusion(in_dim=in_dim, optim_params=None)
+
+    # load state from checkpoint
+    tabsyn.load_model_state(ckpt_dir=model_save_dir, dif_ckpt_name="model.pt")
+
+    ###### Synthesize data ######
+
+    # get inverse tokenizers
+    # TODO: refactor the return of the preprocess function so we don't need to ignore mypy here
+    _, _, _, _, num_inverse, cat_inverse = preprocess(  # type: ignore[misc]
+        dataset_path=dataset_path,
+        ref_dataset_path=ref_dataset_path,
+        transforms=tabsyn_config["transforms"],
+        task_type=tabsyn_config["task_type"],
+        inverse=True,
+    )
+
+    synthetic_data_dir = results_dir / config.table_name / "synthetic_data"
+    synthetic_data_dir.mkdir(parents=True, exist_ok=True)
+
+    # load data info file
+    with open(dataset_path / "info.json", "r") as file:
+        data_info = json.load(file)
+
+    data_info["token_dim"] = token_dim
+
+    # sample data
+    num_samples = train_z_att["num_samples"]
+    in_dim = train_z_att["in_dim"]
+    mean_input_emb = train_z_att["mean_input_emb"]
+    tabsyn.sample(
+        num_samples,
+        in_dim,
+        mean_input_emb,
+        info=data_info,
+        num_inverse=num_inverse,
+        cat_inverse=cat_inverse,
+        save_path=synthetic_data_dir / f"{config.table_name}_synthetic.csv",
+    )
+
+    log(INFO, "Synthesizing Done!")
+
+
+if __name__ == "__main__":
+    tabsyn_synthesize()
diff --git a/examples/tabsyn/tabsyn_config.toml b/examples/tabsyn/tabsyn_config.toml
new file mode 100644
index 00000000..6b31d7f6
--- /dev/null
+++ b/examples/tabsyn/tabsyn_config.toml
@@ -0,0 +1,42 @@
+task_type = "regression"
+
+[model_params]
+n_head = 1
+factor = 32
+num_layers = 2
+d_token = 4
+
+[transforms]
+normalization = "quantile"
+numerical_nan_policy = "mean"
+categorical_nan_policy = "__none__"
+category_minimum_frequency = "__none__"
+categorical_encoding = "__none__"
+target_policy = "default"
+
+[train.vae]
+num_epochs = 4000
+batch_size = 4096
+num_dataset_workers = 4
+
+[train.diffusion]
+num_epochs = 10001
+batch_size = 4096
+num_dataset_workers = 4
+
+[train.optim.vae]
+lr = 1e-3
+weight_decay = 0
+factor = 0.95
+patience = 10
+
+[train.optim.diffusion]
+lr = 1e-3
+weight_decay = 0
+factor = 0.9
+patience = 20
+
+[loss_params]
+max_beta = 1e-2
+min_beta = 1e-5
+lambd = 0.7
diff --git a/examples/tabsyn/train.py b/examples/tabsyn/train.py
new file mode 100644
index 00000000..66ec8940
--- /dev/null
+++ b/examples/tabsyn/train.py
@@ -0,0 +1,185 @@
+import shutil
+from logging import INFO
+from pathlib import Path
+
+import hydra
+import pandas as pd
+import torch
+from omegaconf import DictConfig
+from torch import Tensor
+from torch.utils.data import DataLoader
+
+from midst_toolkit.common.enumerations import DataSplit
+from midst_toolkit.common.logger import log
+from midst_toolkit.common.variables import DEVICE
+from midst_toolkit.models.tabsyn.config import load_config
+from midst_toolkit.models.tabsyn.dataset import TabularDataset, preprocess
+from midst_toolkit.models.tabsyn.pipeline import TabSyn
+from midst_toolkit.models.tabsyn.preprocessing import get_processed_data_dir, process_data
+
+
+@hydra.main(config_path=".", config_name="config", version_base=None)
+def train_tabsyn(config: DictConfig) -> None:
+    """
+    Train a TabSyn model.
+
+    Args:
+        config: Configuration as an OmegaConf DictConfig object.
+    """
+    log(INFO, "Training TabSyn model...")
+    results_dir = Path(config.results_dir)
+    results_dir.mkdir(parents=True, exist_ok=True)
+
+    data_name = _sample_data_if_needed(
+        config.table_name,
+        Path(config.data_dir),
+        Path(config.results_dir),
+        config.get("training", {}).get("sample_size", None),
+    )
+    process_data(config.table_name, Path(config.data_dir), Path(config.data_dir), data_name=data_name)
+
+    tabsyn_config = load_config(Path(config.tabsyn_config))
+
+    # The preprocess function below expects 2 folders of preprocessed data:
+    # 1. the dataset to be processed, which can be a subset of the full dataset
+    # 2. the full dataset, which is used to unsure it will get all the categories for the categorical features
+    # Here we are making the dataset #2 (full dataset) by copying the dataset #1
+    dataset_path = get_processed_data_dir(Path(config.data_dir)) / config.table_name
+    ref_dataset_path = Path(f"{dataset_path}_all")
+    shutil.rmtree(ref_dataset_path, ignore_errors=True)
+    shutil.copytree(dataset_path, ref_dataset_path)
+
+    log(INFO, "Preprocessing data...")
+
+    # preprocess the data
+    # TODO: refactor the return of the preprocess function so we don't need to ignore mypy here
+    numerical_features, categorical_features, categories, d_numerical = preprocess(  # type: ignore[misc]
+        dataset_path=dataset_path,
+        ref_dataset_path=ref_dataset_path,
+        transforms=tabsyn_config["transforms"],
+        task_type=tabsyn_config["task_type"],
+    )
+
+    # separate train and test data
+    numerical_features_train = numerical_features[DataSplit.TRAIN.value]
+    numerical_features_test = numerical_features[DataSplit.TEST.value]
+    categorical_features_train = categorical_features[DataSplit.TRAIN.value]
+    categorical_features_test = categorical_features[DataSplit.TEST.value]
+
+    # convert to float tensor
+    numerical_features_train = torch.tensor(numerical_features_train).float()
+    numerical_features_test = torch.tensor(numerical_features_test).float()
+    categorical_features_train = torch.tensor(categorical_features_train)
+    categorical_features_test = torch.tensor(categorical_features_test)
+
+    log(INFO, "Loading the dataset...")
+
+    # create dataset module
+    train_data = TabularDataset(numerical_features_train.float(), categorical_features_train)
+
+    # move test data to gpu if available
+    numerical_features_test = numerical_features_test.float().to(DEVICE)
+    categorical_features_test = categorical_features_test.to(DEVICE)
+
+    # create train dataloader
+    train_loader: DataLoader[TabularDataset] = DataLoader[TabularDataset](
+        # Ignoring here because this is expecting the dataset to be subclass of torch's Dataset but it isn't
+        train_data,  # type: ignore[arg-type]
+        batch_size=tabsyn_config["train"]["vae"]["batch_size"],
+        shuffle=True,
+        num_workers=tabsyn_config["train"]["vae"]["num_dataset_workers"],
+    )
+
+    log(INFO, "Instantiating the TabSyn model...")
+
+    # Instantiate the model
+    tabsyn = TabSyn(
+        train_loader,
+        numerical_features_test,
+        categorical_features_test,
+        num_numerical_features=d_numerical,
+        num_classes=categories,
+        device=DEVICE,
+    )
+
+    model_save_dir = results_dir / config.table_name
+    vae_save_dir = model_save_dir / "vae"
+    vae_save_dir.mkdir(parents=True, exist_ok=True)
+
+    ###### A. Train the VAE model ######
+
+    log(INFO, "Training the TabSyn VAE model...")
+
+    # instantiate VAE model for training
+    tabsyn.instantiate_vae(
+        **tabsyn_config["model_params"],
+        optim_params=tabsyn_config["train"]["optim"]["vae"],
+    )
+
+    tabsyn.train_vae(
+        **tabsyn_config["loss_params"],
+        num_epochs=tabsyn_config["train"]["vae"]["num_epochs"],
+        save_path=vae_save_dir,
+    )
+
+    # embed all inputs in the latent space
+    tabsyn.save_vae_embeddings(
+        numerical_features_train,
+        categorical_features_train,
+        vae_ckpt_dir=vae_save_dir,
+    )
+    tabsyn.save_embeddings_attributes(vae_ckpt_dir=vae_save_dir)
+
+    ###### B. Train the Diffusion model ######
+
+    log(INFO, "Training the TabSyn Diffusion model...")
+
+    # load latent space embeddings
+    train_z, _ = tabsyn.load_latent_embeddings(vae_save_dir)  # train_z dim: B x in_dim
+
+    # normalize embeddings
+    latent_train_data = (train_z - train_z.mean(0)) / 2
+
+    # create data loader
+    latent_train_loader: DataLoader[Tensor] = DataLoader[Tensor](
+        # Ignoring the type checker here because our code in tabsyn.train_diffusion
+        # works with plain Tensor and not with TensorDataset
+        latent_train_data,  # type: ignore[arg-type]
+        batch_size=tabsyn_config["train"]["diffusion"]["batch_size"],
+        shuffle=True,
+        num_workers=tabsyn_config["train"]["diffusion"]["num_dataset_workers"],
+    )
+
+    # instantiate diffusion model for training
+    tabsyn.instantiate_diffusion(
+        in_dim=train_z.shape[1],
+        optim_params=tabsyn_config["train"]["optim"]["diffusion"],
+    )
+
+    # train diffusion model
+    tabsyn.train_diffusion(
+        latent_train_loader,
+        num_epochs=tabsyn_config["train"]["diffusion"]["num_epochs"],
+        ckpt_path=model_save_dir,
+    )
+
+    log(INFO, "Training Done!")
+
+
+def _sample_data_if_needed(table_name: str, data_dir: Path, results_dir: Path, sample_size: int | None) -> str:
+    data_name = table_name
+    if sample_size:
+        log(INFO, f"Sampling {sample_size} rows from data...")
+        all_data = pd.read_csv(data_dir / f"{table_name}.csv")
+        sampled_data = all_data.sample(n=sample_size)
+        results_dir.mkdir(parents=True, exist_ok=True)
+
+        data_name = f"{table_name}_sampled"
+        sampled_data.to_csv(results_dir / f"{data_name}.csv", index=False)
+        sampled_data.to_csv(data_dir / f"{data_name}.csv", index=False)
+
+    return data_name
+
+
+if __name__ == "__main__":
+    train_tabsyn()
diff --git a/src/midst_toolkit/attacks/ensemble/models.py b/src/midst_toolkit/attacks/ensemble/models.py
index 5326faa6..261d2211 100644
--- a/src/midst_toolkit/attacks/ensemble/models.py
+++ b/src/midst_toolkit/attacks/ensemble/models.py
@@ -2,19 +2,24 @@
 
 import copy
 import json
+import shutil
+import tempfile
 from abc import ABC, abstractmethod
 from logging import INFO
 from pathlib import Path
 from typing import Any
 
 import pandas as pd
+import torch
 from omegaconf import DictConfig
 from pydantic import BaseModel, ConfigDict
 from sdv.metadata import SingleTableMetadata
 from sdv.single_table import CTGANSynthesizer
+from torch.utils.data import DataLoader
 
 from midst_toolkit.attacks.ensemble.clavaddpm_fine_tuning import clava_fine_tuning
-from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, TrainingConfig
+from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig, GeneralConfig, TrainingConfig
+from midst_toolkit.common.enumerations import DataSplit
 from midst_toolkit.common.logger import log
 from midst_toolkit.common.variables import DEVICE
 from midst_toolkit.models.clavaddpm.clustering import clava_clustering
@@ -22,6 +27,10 @@
 from midst_toolkit.models.clavaddpm.enumerations import GroupLengthsProbDicts, Relation, RelationOrder
 from midst_toolkit.models.clavaddpm.synthesizer import clava_synthesizing
 from midst_toolkit.models.clavaddpm.train import ClavaDDPMModelArtifacts, CTGANModelArtifacts, clava_training
+from midst_toolkit.models.tabsyn.config import load_config
+from midst_toolkit.models.tabsyn.dataset import TabularDataset, preprocess
+from midst_toolkit.models.tabsyn.pipeline import TabSyn
+from midst_toolkit.models.tabsyn.preprocessing import get_processed_data_dir, process_data
 
 
 class EnsembleAttackTrainingResult(BaseModel):
@@ -343,3 +352,298 @@ def get_single_table_svd_metadata(
     metadata.remove_primary_key()
 
     return metadata, data_without_ids
+
+
+class EnsembleAttackTabSynModelRunner(EnsembleAttackModelRunner):
+    def __init__(self, config: DictConfig):
+        """
+        Initialize the ensemble attack model runner for the TabSyn model with a config dictionary.
+
+        Args:
+            config: The config from the config.yaml file for the ensemble attack model.
+                Must contain the following keys:
+                - tabsyn_config: The tabsyn config path for the TabSyn model.
+                - data_dir: The data directory for the TabSyn model.
+                - results_dir: The results directory for the TabSyn model.
+                - table_name: The name of the table the TabSyn model is being trained on.
+        """
+        self.tabsyn_config = load_config(config.tabsyn_config)
+        self.table_name = config.table_name
+        self.full_data_dir = Path(config.data_dir)
+        self.training_config = TrainingConfig(
+            general=GeneralConfig(
+                data_dir=self.full_data_dir,
+                test_data_dir=Path(f"{self.full_data_dir}-test"),
+                exp_name=f"{config.table_name}-ensemble-attack",
+                workspace_dir=Path(config.results_dir),
+                sample_prefix=f"{config.table_name}-ea",
+            ),
+            save_dir=Path(config.results_dir) / self.table_name,
+        )
+
+        log(INFO, "Processing TabSyn full dataset...")
+        process_data(self.table_name, self.full_data_dir, self.full_data_dir)
+
+    @property
+    def vae_save_dir(self) -> Path:
+        """
+        Get the directory to save the VAE model.
+
+        Will return the save directory relative to the current self.training_config.save_dir.
+        """
+        assert self.training_config.save_dir is not None, "Save dir is not set"
+        return self.training_config.save_dir / "vae"
+
+    def train_or_fine_tune_and_synthesize(
+        self,
+        dataset: pd.DataFrame,
+        synthesize: bool = True,
+        trained_model: EnsembleAttackTrainingResult | None = None,
+    ) -> EnsembleAttackTrainingResult:
+        """
+        Train or fine tune a TabSyn model on the provided dataset and optionally synthesize data.
+
+        Args:
+            dataset: The dataset as a pandas DataFrame.
+            synthesize: Flag indicating whether to generate synthetic data after training.
+                The number of points to synthesize and the save directory is controlled by
+                the `number_of_points_to_synthesize` and `save_dir` attributes of the training
+                config. Optional, default is True.
+            trained_model: The trained model to fine tune. If None, a new model will be trained.
+        """
+        log(INFO, "Training or Fine Tuning TabSyn model...")
+        self.training_config.general.workspace_dir.mkdir(parents=True, exist_ok=True)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            tabsyn = copy.deepcopy(trained_model.models) if trained_model is not None else None
+
+            temp_path = self._populate_temp_dataset_dir(temp_dir, dataset)
+            tabsyn = self._train_or_fine_tune(temp_path, tabsyn)
+
+        synthetic_data: pd.DataFrame | None = None
+        if synthesize:
+            synthetic_data = self._synthesize(tabsyn)
+
+        return EnsembleAttackTrainingResult(
+            save_dir=self.training_config.general.workspace_dir,
+            configs=self.training_config,
+            models=tabsyn,
+            synthetic_data=synthetic_data,
+        )
+
+    def _populate_temp_dataset_dir(self, temp_dir: str, dataset: pd.DataFrame) -> Path:
+        temp_path = Path(temp_dir)
+
+        dataset.to_csv(temp_path / f"{self.table_name}.csv", index=False)
+        for filename in [f"{self.table_name}.json", f"{self.table_name}_info.json"]:
+            shutil.copy(self.full_data_dir / filename, temp_path / filename)
+
+        return temp_path
+
+    def _dataset_and_ref_dataset_paths(self, data_dir: Path) -> tuple[Path, Path]:
+        # The preprocess function below expects 2 folders of preprocessed data:
+        # 1. the dataset to be processed, which can be a subset of the full dataset
+        # 2. the full dataset, which is used to unsure it will get all the categories for the categorical features
+        dataset_path = get_processed_data_dir(data_dir) / self.table_name
+        full_dataset_path = get_processed_data_dir(self.full_data_dir) / self.table_name
+        ref_dataset_path = Path(f"{dataset_path}_all")
+        shutil.rmtree(ref_dataset_path, ignore_errors=True)
+        shutil.copytree(full_dataset_path, ref_dataset_path)
+
+        return dataset_path, ref_dataset_path
+
+    def _train_or_fine_tune(self, data_dir: Path, tabsyn: TabSyn | None) -> TabSyn:
+        fine_tune = tabsyn is not None
+        log(INFO, f"{'Fine Tuning' if fine_tune else 'Training'} new TabSyn model...")
+
+        assert self.training_config.save_dir is not None, "Save dir is not set"
+
+        process_data(self.table_name, data_dir, data_dir)
+
+        log(INFO, "Preprocessing data...")
+
+        dataset_path, ref_dataset_path = self._dataset_and_ref_dataset_paths(data_dir)
+
+        # preprocess the data
+        # TODO: refactor the return of the preprocess function so we don't need to ignore mypy here
+        numerical_features, categorical_features, categories, d_numerical = preprocess(  # type: ignore[misc]
+            dataset_path=dataset_path,
+            ref_dataset_path=ref_dataset_path,
+            transforms=self.tabsyn_config["transforms"],
+            task_type=self.tabsyn_config["task_type"],
+        )
+
+        # separate train and test data
+        numerical_features_train = numerical_features[DataSplit.TRAIN.value]
+        numerical_features_test = numerical_features[DataSplit.TEST.value]
+        categorical_features_train = categorical_features[DataSplit.TRAIN.value]
+        categorical_features_test = categorical_features[DataSplit.TEST.value]
+
+        log(INFO, f"Dataset size: {numerical_features_train.shape[0]}")
+
+        # convert to float tensor
+        numerical_features_train = torch.tensor(numerical_features_train).float()
+        numerical_features_test = torch.tensor(numerical_features_test).float()
+        categorical_features_train = torch.tensor(categorical_features_train)
+        categorical_features_test = torch.tensor(categorical_features_test)
+
+        log(INFO, "Loading the dataset...")
+
+        # create dataset module
+        train_data = TabularDataset(numerical_features_train.float(), categorical_features_train)
+
+        # move test data to gpu if available
+        numerical_features_test = numerical_features_test.float().to(DEVICE)
+        categorical_features_test = categorical_features_test.to(DEVICE)
+
+        # create train dataloader
+        train_loader: DataLoader[TabularDataset] = DataLoader[TabularDataset](
+            # Ignoring here because this is expecting the dataset to be subclass of torch's Dataset but it isn't
+            train_data,  # type: ignore[arg-type]
+            batch_size=self.tabsyn_config["train"]["vae"]["batch_size"],
+            shuffle=True,
+            num_workers=self.tabsyn_config["train"]["vae"]["num_dataset_workers"],
+        )
+
+        if fine_tune:
+            # Load the data into the existing model
+            log(INFO, "Loading data into existing TabSyn model...")
+            assert tabsyn is not None, "TabSyn model is not set"
+            tabsyn.train_loader = train_loader
+            tabsyn.numerical_features_test = numerical_features_test
+            tabsyn.categorical_features_test = categorical_features_test
+        else:
+            # Instantiate a new model
+            log(INFO, "Instantiating the TabSyn model...")
+            tabsyn = TabSyn(
+                train_loader,
+                numerical_features_test,
+                categorical_features_test,
+                num_numerical_features=d_numerical,
+                num_classes=categories,
+                device=DEVICE,
+            )
+
+        ###### A. Train the VAE model ######
+
+        log(INFO, f"{'Fine tuning' if fine_tune else 'Training'} the TabSyn VAE model...")
+
+        self.vae_save_dir.mkdir(parents=True, exist_ok=True)
+        if not fine_tune:
+            # instantiate VAE model for training
+            tabsyn.instantiate_vae(
+                **self.tabsyn_config["model_params"],
+                optim_params=self.tabsyn_config["train"]["optim"]["vae"],
+            )
+
+        tabsyn.train_vae(
+            **self.tabsyn_config["loss_params"],
+            num_epochs=self.tabsyn_config["train"]["vae"]["num_epochs"],
+            save_path=self.vae_save_dir,
+        )
+
+        # embed all inputs in the latent space
+        tabsyn.save_vae_embeddings(
+            numerical_features_train,
+            categorical_features_train,
+            vae_ckpt_dir=self.vae_save_dir,
+        )
+        tabsyn.save_embeddings_attributes(vae_ckpt_dir=self.vae_save_dir)
+
+        ###### B. Train the Diffusion model ######
+
+        log(INFO, f"{'Fine tuning' if fine_tune else 'Training'} the TabSyn Diffusion model...")
+
+        # load latent space embeddings
+        train_z, _ = tabsyn.load_latent_embeddings(self.vae_save_dir)  # train_z dim: B x in_dim
+
+        # normalize embeddings
+        latent_train_data = (train_z - train_z.mean(0)) / 2
+
+        # create data loader
+        latent_train_loader: DataLoader[torch.Tensor] = DataLoader[torch.Tensor](
+            # Ignoring the type checker here because our code in tabsyn.train_diffusion
+            # works with plain Tensor and not with TensorDataset
+            latent_train_data,  # type: ignore[arg-type]
+            batch_size=self.tabsyn_config["train"]["diffusion"]["batch_size"],
+            shuffle=True,
+            num_workers=self.tabsyn_config["train"]["diffusion"]["num_dataset_workers"],
+        )
+
+        if not fine_tune:
+            # instantiate diffusion model for training
+            tabsyn.instantiate_diffusion(
+                in_dim=train_z.shape[1],
+                optim_params=self.tabsyn_config["train"]["optim"]["diffusion"],
+            )
+
+        # train diffusion model
+        tabsyn.train_diffusion(
+            latent_train_loader,
+            num_epochs=self.tabsyn_config["train"]["diffusion"]["num_epochs"],
+            ckpt_path=self.training_config.save_dir,
+        )
+
+        log(INFO, f"{'Fine tuning' if fine_tune else 'Training'} Done!")
+
+        return tabsyn
+
+    def _synthesize(self, tabsyn: TabSyn) -> pd.DataFrame:
+        ###### Load the model ######
+
+        assert self.training_config.save_dir is not None, "Save dir is not set"
+
+        # instantiate VAE model
+        tabsyn.instantiate_vae(**self.tabsyn_config["model_params"], optim_params=None)
+
+        # load latent embeddings attributes of input data
+        train_z_att = tabsyn.load_embeddings_attributes(self.vae_save_dir)
+        token_dim = train_z_att["token_dim"]
+        in_dim = train_z_att["in_dim"]
+
+        # instantiate diffusion model
+        tabsyn.instantiate_diffusion(in_dim=in_dim, optim_params=None)
+
+        # load state from checkpoint
+        tabsyn.load_model_state(ckpt_dir=self.training_config.save_dir, dif_ckpt_name="model.pt")
+
+        ###### Synthesize data ######
+
+        dataset_path, ref_dataset_path = self._dataset_and_ref_dataset_paths(self.full_data_dir)
+
+        # get inverse tokenizers
+        # TODO: refactor the return of the preprocess function so we don't need to ignore mypy here
+        _, _, _, _, num_inverse, cat_inverse = preprocess(  # type: ignore[misc]
+            dataset_path=dataset_path,
+            ref_dataset_path=ref_dataset_path,
+            transforms=self.tabsyn_config["transforms"],
+            task_type=self.tabsyn_config["task_type"],
+            inverse=True,
+        )
+
+        synthetic_data_dir = self.training_config.save_dir / "synthetic_data"
+        synthetic_data_dir.mkdir(parents=True, exist_ok=True)
+
+        # load data info file
+        with open(dataset_path / "info.json", "r") as file:
+            data_info = json.load(file)
+
+        data_info["token_dim"] = token_dim
+
+        # sample data
+        num_samples = train_z_att["num_samples"]
+        in_dim = train_z_att["in_dim"]
+        mean_input_emb = train_z_att["mean_input_emb"]
+        tabsyn.sample(
+            num_samples,
+            in_dim,
+            mean_input_emb,
+            info=data_info,
+            num_inverse=num_inverse,
+            cat_inverse=cat_inverse,
+            save_path=synthetic_data_dir / f"{self.table_name}_synthetic.csv",
+        )
+
+        log(INFO, "Synthesizing Done!")
+
+        return pd.read_csv(synthetic_data_dir / f"{self.table_name}_synthetic.csv")
diff --git a/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py b/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py
index b41c7cbf..7ddf4148 100644
--- a/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py
+++ b/src/midst_toolkit/attacks/ensemble/rmia/rmia_calculation.py
@@ -123,6 +123,9 @@ def compute_gower_for_model(
     if len(df_synthetic) > min_length:
         df_synthetic = df_synthetic.sample(n=min_length, random_state=random_seed)
 
+    # Drop id columns
+    id_columns_names = [col for col in df_synthetic.columns if "_id" in col]
+    df_synthetic = df_synthetic.drop(columns=id_columns_names)
     if id_column_name in df_synthetic.columns:
         df_synthetic = df_synthetic.drop(columns=[id_column_name])
 
diff --git a/src/midst_toolkit/models/tabsyn/dataset.py b/src/midst_toolkit/models/tabsyn/dataset.py
index e720d46c..5761cd57 100644
--- a/src/midst_toolkit/models/tabsyn/dataset.py
+++ b/src/midst_toolkit/models/tabsyn/dataset.py
@@ -52,11 +52,7 @@ def preprocess(
     task_type: TaskType = TaskType.BINARY_CLASSIFICATION,
     inverse: bool = False,
     concat: bool = True,
-) -> (
-    Dataset
-    | tuple[ArrayDict, ArrayDict, list[int] | None, int]
-    | tuple[ArrayDict, ArrayDict, list[int] | None, int, Any, Any]
-):
+) -> Dataset | tuple[ArrayDict, ArrayDict, list[int], int] | tuple[ArrayDict, ArrayDict, list[int], int, Any, Any]:
     """Preprocess the dataset.
 
     Args:
@@ -197,7 +193,7 @@ def make_dataset(
     return transform_dataset(dataset, transformations, None)
 
 
-def get_categories(categorical_features_train: np.ndarray | None) -> list[int] | None:
+def get_categories(categorical_features_train: np.ndarray | None) -> list[int]:
     """Get the length of the unique categories from the categorical features.
 
     Args:
@@ -207,7 +203,7 @@ def get_categories(categorical_features_train: np.ndarray | None) -> list[int] |
         The length of the unique categories for each feature.
     """
     if categorical_features_train is None:
-        return None
+        return []
     return [len(set(categorical_features_train[:, i])) for i in range(categorical_features_train.shape[1])]
 
 
diff --git a/src/midst_toolkit/models/tabsyn/pipeline.py b/src/midst_toolkit/models/tabsyn/pipeline.py
index 8977f834..e3ee667e 100644
--- a/src/midst_toolkit/models/tabsyn/pipeline.py
+++ b/src/midst_toolkit/models/tabsyn/pipeline.py
@@ -60,9 +60,22 @@ def __init__(
         self.device = device
 
     def instantiate_vae(
-        self, n_head: int, factor: int, num_layers: int, d_token: int, optim_params: dict[str, Any]
+        self,
+        n_head: int,
+        factor: int,
+        num_layers: int,
+        d_token: int,
+        optim_params: dict[str, Any] | None = None,
     ) -> None:
-        """Construct VAE model and its optimizer and lr scheduler."""
+        """Construct VAE model and its optimizer and lr scheduler.
+
+        Args:
+            n_head: The number of heads.
+            factor: The factor for the dimension of the hidden layer.
+            num_layers: The number of layers.
+            d_token: The dimension of the token.
+            optim_params: The optimizer parameters. Optional, defaults to None.
+        """
         # construct vae model
         self.vae_model, self.pre_encoder, self.pre_decoder = self.__get_vae_model(n_head, factor, num_layers, d_token)
         # construct vae optimizer and scheduler
@@ -70,8 +83,13 @@ def instantiate_vae(
             self.vae_optimizer, self.vae_scheduler = self.__load_optim(self.vae_model, **optim_params)
         log(INFO, "Successfully instantiated VAE model.")
 
-    def instantiate_diffusion(self, in_dim: int, optim_params: dict[str, Any]) -> None:
-        """Construct Diffusion model and its optimizer and lr scheduler."""
+    def instantiate_diffusion(self, in_dim: int, optim_params: dict[str, Any] | None) -> None:
+        """Construct Diffusion model and its optimizer and lr scheduler.
+
+        Args:
+            in_dim: The dimension of the input.
+            optim_params: The optimizer parameters. Optional, defaults to None.
+        """
         # load diffusion model
         self.diffusion_model = self.__get_diffusion_model(input_dimension=in_dim)
         # load optimizer and scheduler
diff --git a/src/midst_toolkit/models/tabsyn/preprocessing.py b/src/midst_toolkit/models/tabsyn/preprocessing.py
index 7f790ce3..0d78378e 100644
--- a/src/midst_toolkit/models/tabsyn/preprocessing.py
+++ b/src/midst_toolkit/models/tabsyn/preprocessing.py
@@ -125,7 +125,7 @@ def train_val_test_split(
 
 
 # TODO: Refactor this function to get rid of the "too many statements" and "too many branches" errors from Ruff
-def process_data(name: str, info_path: Path, data_dir: Path) -> None:
+def process_data(name: str, info_path: Path, data_dir: Path, data_name: str | None = None) -> None:
     # ruff: noqa: PLR0915, PLR0912
     """Process the data for the given dataset name.
 
@@ -135,17 +135,39 @@ def process_data(name: str, info_path: Path, data_dir: Path) -> None:
         name: The name of the dataset.
         info_path: The directory where the info file is located.
         data_dir: The directory where the raw data is located.
+        data_name: The name of the data file. If not provided, the data file will be named after the dataset name.
     """
+    if data_name is None:
+        data_name = name
+
+    raw_data_dir = data_dir / "raw_data"
     processed_data_dir = get_processed_data_dir(data_dir)
 
-    with open(info_path / f"{name}.json", "r") as f:
+    with open(info_path / f"{name}_info.json", "r") as f:
         info = json.load(f)
 
-    data_path = info["data_path"]
-    if info["file_type"] == "csv":
+    data_path: Path
+    test_path: Path | None = None
+    if (raw_data_dir / "train.csv").exists():
+        data_path = raw_data_dir / "train.csv"
+        test_path = raw_data_dir / "test.csv"
+    elif (raw_data_dir / "train.xls").exists():
+        data_path = raw_data_dir / "train.xls"
+        test_path = raw_data_dir / "test.xls"
+    else:
+        data_path = data_dir / f"{data_name}.csv"
+
+    assert data_path.exists(), (
+        f"Train data not found in the expected paths. Expected paths are: {data_dir}/{data_name}.csv, {raw_data_dir}/train.csv, {raw_data_dir}/train.xls."
+    )
+    assert test_path is None or test_path.exists(), (
+        f"Test data path not found in the expected paths. Expected paths are: {raw_data_dir}/test.csv, {raw_data_dir}/test.xls."
+    )
+
+    if data_path.suffix == ".csv":
         data_df = pd.read_csv(data_path, header=info["header"])
 
-    elif info["file_type"] == "xls":
+    elif data_path.suffix == ".xls":
         data_df = pd.read_excel(data_path, sheet_name="Data", header=1)
         data_df = data_df.drop("ID", axis=1)
 
@@ -168,9 +190,8 @@ def process_data(name: str, info_path: Path, data_dir: Path) -> None:
     cat_columns = [column_names[i] for i in cat_col_idx]
     target_columns = [column_names[i] for i in target_col_idx]
 
-    if info["test_path"]:
+    if test_path:
         # if testing data is given
-        test_path = info["test_path"]
         test_df = pd.read_csv(test_path)
         train_df = data_df
     else:
diff --git a/tests/integration/assets/tabsyn/trans.json b/tests/integration/assets/tabsyn/trans_info.json
similarity index 71%
rename from tests/integration/assets/tabsyn/trans.json
rename to tests/integration/assets/tabsyn/trans_info.json
index 2ffb2563..7bb3da33 100644
--- a/tests/integration/assets/tabsyn/trans.json
+++ b/tests/integration/assets/tabsyn/trans_info.json
@@ -25,8 +25,5 @@
     ],
     "target_col_idx": [
         3
-    ],
-    "file_type": "csv",
-    "data_path": "tests/integration/assets/tabsyn/raw_data/train.csv",
-    "test_path": "tests/integration/assets/tabsyn/raw_data/test.csv"
+    ]
 }
diff --git a/tests/integration/models/tabsyn/test_train.py b/tests/integration/models/tabsyn/test_train.py
index 596cab69..c2ed6c7b 100644
--- a/tests/integration/models/tabsyn/test_train.py
+++ b/tests/integration/models/tabsyn/test_train.py
@@ -7,6 +7,7 @@
 from torch.utils.data import DataLoader
 
 from midst_toolkit.common.enumerations import DataSplit
+from midst_toolkit.common.variables import DEVICE
 from midst_toolkit.models.tabsyn.config import load_config
 from midst_toolkit.models.tabsyn.dataset import TabularDataset, preprocess
 from midst_toolkit.models.tabsyn.pipeline import TabSyn
@@ -67,9 +68,8 @@ def test_train_load_and_synthesize(test_dirs):
     train_data = TabularDataset(numerical_features_train.float(), categorical_features_train)
 
     # move test data to gpu if available
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    numerical_features_test = numerical_features_test.float().to(device)
-    categorical_features_test = categorical_features_test.to(device)
+    numerical_features_test = numerical_features_test.float().to(DEVICE)
+    categorical_features_test = categorical_features_test.to(DEVICE)
 
     # create train dataloader
     train_loader = DataLoader(
@@ -86,7 +86,7 @@ def test_train_load_and_synthesize(test_dirs):
         categorical_features_test,
         num_numerical_features=d_numerical,
         num_classes=categories,
-        device=device,
+        device=DEVICE,
     )
 
     model_save_dir = results_dir / test_data_name