VectorInstitute · lotif · Mar 23, 2026 · Mar 23, 2026 · Apr 6, 2026 · Apr 8, 2026
diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml
@@ -29,10 +29,10 @@ jobs:
   run-code-check:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6.0.2
+      - uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39
         with:
           # Install a specific version of uv.
           version: "0.5.21"

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -42,10 +42,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
-        uses: actions/checkout@v6.0.2
+        uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v8.1.0
+        uses: astral-sh/setup-uv@v8.2.0
         with:
           version: "0.5.21"
           enable-cache: true
@@ -77,7 +77,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
-        uses: actions/checkout@v6.0.2
+        uses: actions/checkout@v7.0.0
 
       - name: Configure Git Credentials
         run: |

diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml
@@ -41,10 +41,10 @@ jobs:
   integration-tests:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6.0.2
+      - uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39
         with:
           # Install a specific version of uv.
           version: "0.5.21"

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -16,10 +16,10 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install libcurl4-openssl-dev libssl-dev
-      - uses: actions/checkout@v6.0.2
+      - uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39
         with:
           # Install a specific version of uv.
           version: "0.5.21"

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -41,10 +41,10 @@ jobs:
   unit-tests:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v6.0.2
+      - uses: actions/checkout@v7.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39
         with:
           # Install a specific version of uv.
           version: "0.5.21"

diff --git a/examples/gan/README.md b/examples/gan/README.md
@@ -94,7 +94,7 @@ Kolmogorov-Smirnov (KS) test, Total Variation Distance (TVD), Correlation Matrix
 and Mutual Information Difference.
 
 To compute those metrics, you can run the command below. The name of the table should be
-defined in the `dataset_meta.json` file, and the file for synthetic data should be under
+defined in the `dataset_meta.json` file, and the data files should be under
 `/data/{table_name}.csv` for the real data and `/results/{table_name}_synthetic.csv`
 for the synthetic data.
 

diff --git a/examples/gan/ensemble_attack/README.md b/examples/gan/ensemble_attack/README.md
@@ -60,10 +60,6 @@ python -m examples.gan.ensemble_attack.make_challenge_dataset
 
 ## 4. Training the attack model
 
-> [!NOTE]
-> In the [`config.yaml`](config.yaml) file, the attribute `ensemble_attack.shadow_training.model_name`
-> is what determines this attack will be run with the CTGAN model.
-
 To train the attack models, execute the following command:
 
 ```bash
@@ -81,7 +77,7 @@ To test the attack model against the target model and synthetic data produced on
 [step 2](#2-generating-target-synthetic-data-to-be-tested), please run:
 
 ```bash
-python -m examples.gan.ensemble_attack.test_attack_model
+python -m examples.gan.ensemble_attack.run_test_attack_model
 ```
 
 ## 6. Compute the attack success

diff --git a/examples/gan/ensemble_attack/config.yaml b/examples/gan/ensemble_attack/config.yaml
@@ -36,7 +36,6 @@ ensemble_attack:
     run_metaclassifier_training: true
 
   shadow_training:
-    model_name: ctgan
     model_config:  # Configurations specific for the CTGAN model
       training:
         epochs: 300

diff --git a/examples/gan/ensemble_attack/make_challenge_dataset.py b/examples/gan/ensemble_attack/make_challenge_dataset.py
@@ -39,6 +39,8 @@ def make_challenge_dataset(config: DictConfig) -> None:
     log(INFO, f"Saving challenge labels to {challenge_label_path}")
     np.save(challenge_label_path, challenge_data_labels)
 
+    log(INFO, "Done!")
+
 
 if __name__ == "__main__":
     make_challenge_dataset()
diff --git a/.../gan/ensemble_attack/test_attack_model.py → .../ensemble_attack/run_test_attack_model.py b/.../gan/ensemble_attack/test_attack_model.py → .../ensemble_attack/run_test_attack_model.py
diff --git a/examples/gan/ensemble_attack/utils.py b/examples/gan/ensemble_attack/utils.py
@@ -40,6 +40,14 @@ def make_training_config(config: DictConfig) -> dict[Any, Any]:
     Returns:
         The ensemble attack training config for the CTGAN model.
     """
+    base_data_dir = str
+    if "base_data_dir" in config:
+        base_data_dir = config.base_data_dir
+    elif "data_dir" in config:
+        base_data_dir = config.data_dir
+    else:
+        raise ValueError("Either base_data_dir or data_dir must be provided in the config.")
+
     # Saving the model config from the config.yaml into a json file
     # because that's what the ensemble attack code will be looking for
     training_config_path = Path(config.ensemble_attack.shadow_training.training_json_config_paths.training_config_path)
@@ -48,10 +56,10 @@ def make_training_config(config: DictConfig) -> dict[Any, Any]:
         training_config = OmegaConf.to_container(config.ensemble_attack.shadow_training.model_config, resolve=True)
         assert isinstance(training_config, dict), "Training config must be a dictionary."
         training_config["general"] = {
-            "test_data_dir": config.base_data_dir,
+            "test_data_dir": base_data_dir,
             "sample_prefix": "ctgan",
-            "data_dir": config.base_data_dir,
-            "workspace_dir": str(Path(config.base_data_dir) / "shadow_workspace"),
+            "data_dir": base_data_dir,
+            "workspace_dir": str(Path(base_data_dir) / "shadow_workspace"),
             "exp_name": "pre_trained_model",
         }
         json.dump(training_config, f)

diff --git a/examples/tabsyn/README.md b/examples/tabsyn/README.md
@@ -0,0 +1,92 @@
+# TabSyn Single Table Example
+
+This example will go over training a single-table [TabSyn](https://arxiv.org/abs/2310.09656)
+model and synthesizing data afterwards.
+
+
+## Downloading data
+
+First, we need the data. Download it from this
+[Google Drive link](https://drive.google.com/file/d/1HTgfgeL5GXc8uAGfeQirJrUynK7vFeyb/view?usp=drive_link),
+extract the files and place them in a `/data` folder within this folder
+(`examples/tabsyn`).
+
+> [!NOTE]
+> If you wish to change the data folder, you can do so by editing the `data_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+Here is a description of the files that have been extracted:
+- `trans.csv`: The training data. It consists of information about bank transactions and it
+contains 20,000 data points.
+- `trans_info.json`: Metadata about the `trans.csv` data, with information such as which columns are
+numerical and which are categorical, what is the task type, etc.
+
+
+## Kicking off training
+
+To kick off training, simply run the command below from the project's root folder:
+
+```bash
+python -m examples.tabsyn.train
+```
+
+
+## Training results
+
+The result files will be saved inside a `/results` folder within this folder
+(`examples/tabsyn`).
+
+> [!NOTE]
+> If you wish to change the save folder, you can do so by editing the `results_dir` attribute
+> of the [`config.yaml`](config.yaml) file.
+
+In the `/results/trans` folder, there will be a file called `model.pt`,
+which is a saved PyTorch model.
+
+
+## Synthesizing data
+
+To synthesize some data with the trained model, run:
+
+```bash
+python -m examples.tabsyn.synthesize
+```
+
+If there is already a trained model in the `/results` folder, it will use that model.
+Otherwise it will train one from scratch. At the end of the script, it will save the
+synthesized data to `/results/trans/synthetic_data/trans_synthetic.csv`.
+
+
+## Evaluating the quality of the synthetic data
+
+### Alpha Precision
+
+To run a round of evaluation with [Alpha Precision](https://arxiv.org/abs/2301.07573)
+metrics on a set of synthetic data, run the command below:
+
+```bash
+python -m midst_toolkit.evaluation.quality.scripts.midst_alpha_precision_eval \
+  --synthetic_data_path examples/tabsyn/results/trans/synthetic_data/trans_synthetic.csv \
+  --real_data examples/tabsyn/data/trans_sampled.csv \
+  --meta_info_path examples/tabsyn/data/meta_info.json \
+  --save_directory examples/tabsyn/results/
+```
-python -m midst_toolkit.evaluation.quality.scripts.midst_alpha_precision_eval \
-  --synthetic_data_path examples/tabsyn/results/trans/synthetic_data/trans_synthetic.csv \
-  --real_data examples/tabsyn/data/trans_sampled.csv \
-  --meta_info_path examples/gan/data/meta_info.json \
-  --save_directory examples/gan/results/
-```
+python -m midst_toolkit.evaluation.quality.scripts.midst_alpha_precision_eval \
+  --synthetic_data_path examples/tabsyn/results/trans/synthetic_data/trans_synthetic.csv \
+  --real_data examples/tabsyn/data/trans_sampled.csv \
+  --meta_info_path examples/tabsyn/data/meta_info.json \
+  --save_directory examples/tabsyn/results/
-python -m midst_toolkit.evaluation.quality.scripts.midst_alpha_precision_eval \
-  --synthetic_data_path examples/tabsyn/results/trans/synthetic_data/trans_synthetic.csv \
-  --real_data examples/tabsyn/data/trans_sampled.csv \
-  --meta_info_path examples/gan/data/meta_info.json \
-  --save_directory examples/gan/results/
-```
+python -m midst_toolkit.evaluation.quality.scripts.midst_alpha_precision_eval \
+  --synthetic_data_path examples/tabsyn/results/trans/synthetic_data/trans_synthetic.csv \
+  --real_data examples/tabsyn/data/trans_sampled.csv \
+  --meta_info_path examples/tabsyn/data/meta_info.json \
+  --save_directory examples/tabsyn/results/
+
+It will save the evaluation results under the `/results/model.txt` file.
+
+### Additional Metrics
+
+The calculation of additional metrics is set up in the `evaluate.py` file. They are the
+Kolmogorov-Smirnov (KS) test, Total Variation Distance (TVD), Correlation Matrix Difference
+and Mutual Information Difference.
+
+To compute those metrics, you can run the command below. The data files should
+be under `/data/{table_name}.csv` for the real data, `/data/{table_name}_sampled.csv`
+for the sampled data used for training, and `/results/{table_name}_synthetic.csv`
+for the synthetic data.
-To compute those metrics, you can run the command below. The data files should
-be under `/data/{table_name}.csv` for the real data, `/data/{table_name}_samples.csv`
-for the sampleed data used for training, and `/results/{table_name}_synthetic.csv`
-for the synthetic data.
+To compute those metrics, you can run the command below. The data files should
+be under `/data/{table_name}.csv` for the real data, `/data/{table_name}_sampled.csv`
+for the sampled data used for training, and `/results/{table_name}_synthetic.csv`
+for the synthetic data.
-To compute those metrics, you can run the command below. The data files should
-be under `/data/{table_name}.csv` for the real data, `/data/{table_name}_samples.csv`
-for the sampleed data used for training, and `/results/{table_name}_synthetic.csv`
-for the synthetic data.
+To compute those metrics, you can run the command below. The data files should
+be under `/data/{table_name}.csv` for the real data, `/data/{table_name}_sampled.csv`
+for the sampled data used for training, and `/results/{table_name}_synthetic.csv`
+for the synthetic data.
+
+```bash
+python -m examples.tabsyn.evaluate
+```
+
+The results will be saved in the `/results/evaluation.json` file.
diff --git a/examples/tabsyn/config.yaml b/examples/tabsyn/config.yaml
@@ -0,0 +1,85 @@
+data_dir: examples/tabsyn/data
+results_dir: examples/tabsyn/results  # referenced below as ${results_dir}
+tabsyn_config: examples/tabsyn/tabsyn_config.toml
+table_name: trans
+
+training:
+  sample_size: 20000
+
+evaluation:
+  # Reporting Configuration
+  write_report: true
+  metric_report_path: ${results_dir}/evaluation_results.txt  # interpolates the top-level results_dir
+
+  # Privacy Evaluation Configurations
+  dcr:
+    run: true
+    norm: "l2"
+    batch_size: 1000
+
+  median_dcr:
+    run: true
+    norm: "l2"
+    batch_size: 1000
+
+  hitting_rate:
+    run: true
+    hitting_threshold: 0.03
+
+  eir:
+    run: true
+    norm: "gower"
+
+  nndr:
+    run: true
+    norm: "l2"
+    batch_size: 1000
+
+  # Quality Evaluation Configurations
+  ks_tv:
+    run: true
+    significance_level: 0.05
+    permutations: 1000
+
+  alpha_precision:
+    run: true
+    naive_only: false
+
+  ci_overlap:
+    run: true
+    confidence_level: 95
+
+  correlation_diff:
+    run: true
+    compute_mixed_correlations: true
+
+  mean_diff:
+    run: true
+
+  f1_score_diff:
+    run: true
+    label_column: "trans_type"
+    folds: 5
+    f1_type: "macro"
+
+  regression_score_diff:
+    run: true
+    label_column: "trans_type"  # NOTE(review): same label column as f1_score_diff; confirm this is intended for regression
+    preprocess_labels: true
+    measure_metrics_in_original_label_space: false
+    verbose: true
+    convert_label_to_float: true
+
+  hellinger:
+    run: true
+    include_numerical_columns: true
+
+  propensity_mse:
+    run: true
+    folds: 5
+    max_iterations: 50
+    solver: "liblinear"
+
+  mutual_information:
+    run: true
+    include_numerical_columns: false