From ca438bd1419849775a08d366416486ba2c03ccdc Mon Sep 17 00:00:00 2001 From: Houman Safaai Date: Mon, 23 Feb 2026 20:22:43 -0500 Subject: [PATCH 1/7] update two-flow --- ...bilenetv2_cifar100_v8_hybrid_seed1011.yaml | 323 +++++++++++ ...obilenetv2_cifar100_v8_hybrid_seed123.yaml | 323 +++++++++++ ...mobilenetv2_cifar100_v8_hybrid_seed42.yaml | 323 +++++++++++ ...obilenetv2_cifar100_v8_hybrid_seed456.yaml | 323 +++++++++++ ...obilenetv2_cifar100_v8_hybrid_seed789.yaml | 323 +++++++++++ .../resnet18_cifar100_v8_hybrid_seed1011.yaml | 323 +++++++++++ .../resnet18_cifar100_v8_hybrid_seed123.yaml | 323 +++++++++++ .../resnet18_cifar100_v8_hybrid_seed42.yaml | 323 +++++++++++ .../resnet18_cifar100_v8_hybrid_seed456.yaml | 323 +++++++++++ .../resnet18_cifar100_v8_hybrid_seed789.yaml | 323 +++++++++++ .../vision_prune/paper_2026_v8/submit_v8.sh | 34 ++ .../paper_2026_v8/v8_config_list.txt | 15 + .../vgg16_cifar100_v8_hybrid_seed1011.yaml | 323 +++++++++++ .../vgg16_cifar100_v8_hybrid_seed123.yaml | 323 +++++++++++ .../vgg16_cifar100_v8_hybrid_seed42.yaml | 323 +++++++++++ .../vgg16_cifar100_v8_hybrid_seed456.yaml | 323 +++++++++++ .../vgg16_cifar100_v8_hybrid_seed789.yaml | 323 +++++++++++ src/alignment/configs/config_loader.py | 49 ++ src/alignment/experiments/base.py | 38 ++ .../experiments/cluster_experiments.py | 549 +++++++++++++++++- .../pruning/strategies/generalized_taylor.py | 33 +- tests/unit/metrics/test_neurips_controls.py | 40 ++ tests/unit/test_experiments.py | 2 + 23 files changed, 5576 insertions(+), 29 deletions(-) create mode 100644 configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed456.yaml create mode 100644 
configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed789.yaml create mode 100755 configs/vision_prune/paper_2026_v8/submit_v8.sh create mode 100644 configs/vision_prune/paper_2026_v8/v8_config_list.txt create mode 100644 configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed789.yaml create mode 100644 tests/unit/metrics/test_neurips_controls.py diff --git a/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed1011.yaml new file mode 100644 index 00000000..408184c1 --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed1011.yaml @@ -0,0 +1,323 @@ +{ + "name": "mobilenetv2_cifar100_cluster_analysis_seed1011_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + 
"cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + 
"chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, 
+ 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + 
"perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed123.yaml new file mode 100644 index 00000000..5e84a21f --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed123.yaml @@ -0,0 +1,323 @@ +{ + "name": "mobilenetv2_cifar100_cluster_analysis_seed123_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": true, + 
"training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + 
"alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + 
"cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + 
"pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + 
"supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed42.yaml new file mode 100644 index 00000000..a930d330 --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed42.yaml @@ -0,0 +1,323 @@ +{ + "name": "mobilenetv2_cifar100_cluster_analysis_seed42_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + 
"metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + 
"simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 
1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + 
"critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + 
"do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed456.yaml new file mode 100644 index 00000000..1b8d296e --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed456.yaml @@ -0,0 +1,323 @@ +{ + "name": "mobilenetv2_cifar100_cluster_analysis_seed456_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + 
"sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": 
"covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, 
+ "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + 
"fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git 
a/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed789.yaml new file mode 100644 index 00000000..515b2466 --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed789.yaml @@ -0,0 +1,323 @@ +{ + "name": "mobilenetv2_cifar100_cluster_analysis_seed789_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": 
"geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + 
"generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + 
"generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed1011.yaml new file mode 100644 index 00000000..6e1a442c --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed1011.yaml @@ -0,0 +1,323 @@ +{ + "name": 
"resnet18_cifar100_cluster_analysis_seed1011_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + 
"use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + 
"dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed123.yaml new file mode 100644 index 00000000..b18c92b2 --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed123.yaml @@ -0,0 +1,323 @@ +{ + "name": "resnet18_cifar100_cluster_analysis_seed123_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": 
"mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + 
"cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + 
"pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], 
+ "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed42.yaml new file mode 100644 index 00000000..c0fc8ea6 --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed42.yaml @@ -0,0 +1,323 @@ +{ + "name": "resnet18_cifar100_cluster_analysis_seed42_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + 
"optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + 
"alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + 
"cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, 
+ "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + 
"enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed456.yaml new file mode 100644 index 00000000..8e184d2d --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed456.yaml @@ -0,0 +1,323 @@ +{ + "name": "resnet18_cifar100_cluster_analysis_seed456_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": 
"pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 
1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + 
"fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": 
null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed789.yaml new file mode 100644 index 00000000..455139ba --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed789.yaml @@ -0,0 +1,323 @@ +{ + "name": "resnet18_cifar100_cluster_analysis_seed789_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + 
"all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + 
"generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": 
false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/submit_v8.sh b/configs/vision_prune/paper_2026_v8/submit_v8.sh new file mode 100755 index 00000000..71445735 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_v8/submit_v8.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --job-name=v8_hybrid +#SBATCH --partition=kempner_h100 +#SBATCH --account=kempner_dev +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --gpus-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64G +#SBATCH --time=04:00:00 +#SBATCH --array=1-15%16 +#SBATCH --output=/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8/slurm_%A_%a.out +#SBATCH --error=/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8/slurm_%A_%a.err + +module load cuda/12.2.0-fasrc01 +module load gcc/12.2.0-fasrc01 + +# Activate conda +eval "$(conda shell.bash hook)" +conda activate networkAlignmentAnalysis + +cd /n/holylabs/kempner_dev/Users/hsafaai/Code/alignment + +# Get config for this array task +CONFIG=$(sed -n "${SLURM_ARRAY_TASK_ID}p" configs/vision_prune/paper_2026_v8/v8_config_list.txt) + +echo "=========================================" +echo "Job $SLURM_ARRAY_JOB_ID task $SLURM_ARRAY_TASK_ID" +echo "Config: $CONFIG" +echo "Node: $(hostname)" +echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)" +echo "=========================================" + +python scripts/run_experiment.py --config "$CONFIG" --allow-dirty diff --git a/configs/vision_prune/paper_2026_v8/v8_config_list.txt b/configs/vision_prune/paper_2026_v8/v8_config_list.txt new file mode 100644 index 00000000..f35c0f6d --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/v8_config_list.txt @@ -0,0 +1,15 @@ +configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed42.yaml +configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed123.yaml +configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed456.yaml +configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed789.yaml +configs/vision_prune/paper_2026_v8/resnet18_cifar100_v8_hybrid_seed1011.yaml 
+configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed42.yaml +configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed123.yaml +configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed456.yaml +configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed789.yaml +configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed1011.yaml +configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed42.yaml +configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed123.yaml +configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed456.yaml +configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed789.yaml +configs/vision_prune/paper_2026_v8/mobilenetv2_cifar100_v8_hybrid_seed1011.yaml diff --git a/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed1011.yaml new file mode 100644 index 00000000..a473f1ac --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed1011.yaml @@ -0,0 +1,323 @@ +{ + "name": "vgg16_cifar100_cluster_analysis_seed1011_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + 
"rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": 
"unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + 
"generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + 
"fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + 
"generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed123.yaml new file mode 100644 index 00000000..3d9f4099 --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed123.yaml @@ -0,0 +1,323 @@ +{ + "name": "vgg16_cifar100_cluster_analysis_seed123_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + 
"gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 
16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + 
"generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 
+ }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + 
"tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed42.yaml new file mode 100644 index 00000000..a9c7991e --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed42.yaml @@ -0,0 +1,323 @@ +{ + "name": "vgg16_cifar100_cluster_analysis_seed42_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + 
"activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": 
"ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + 
"pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed456.yaml new file mode 100644 index 00000000..bc9caee0 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed456.yaml @@ -0,0 +1,323 @@ +{ + "name": "vgg16_cifar100_cluster_analysis_seed456_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": 
"gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + 
"outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": 
true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": 
"./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed789.yaml new file mode 100644 index 00000000..b3624e6c --- /dev/null +++ b/configs/vision_prune/paper_2026_v8/vgg16_cifar100_v8_hybrid_seed789.yaml @@ -0,0 +1,323 @@ +{ + "name": "vgg16_cifar100_cluster_analysis_seed789_rq_exact_tm_simple", + "description": "v8 hybrid: Taylor allocation + IXY ranking", + "tags": [ + "v8", + "hybrid_taylor_alloc" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": true, + "training_epochs": 10, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": null, + "scheduler_config": {}, + "weight_decay": 0.0, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": {}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": 
"mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": null, + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "rq_definition": "covariance_exact", + "type_mapping_mode": "global_simple", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "ixy", + "clustering_importance_mode": "geometric", + "compute_loss_proxy": false, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + 
"cluster_aware_alpha": 2.0, + "cluster_aware_beta": 0.0, + "cluster_aware_gamma": 0.25, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "composite_ixy", + "cluster_aware_spectral_rs_ixy", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + 
"pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 20, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "hybrid_taylor_allocation": true, + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 0.85, + "pruning_enforce_exact_global_channel_budget": true, + "fine_tune_learning_rate": null, + "fine_tune_max_batches": null, + "fine_tune_weight_decay": 0.0, + "fine_tune_type_aware_enabled": true, + "fine_tune_type_aware_methods": [], + "fine_tune_type_aware_lr_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.5, + "background": 1.5 + }, + "fine_tune_type_aware_wd_multipliers": { + "critical": 0.5, + "synergistic": 1.0, + "redundant": 1.25, + "background": 1.5 + }, + "fine_tune_type_aware_scale_batchnorm": true, + "fine_tune_type_aware_scale_classifier": false, + "fine_tune_track_epoch_accuracy": true, + "fine_tune_eval_frequency": 5, + "fine_tune_use_amp": true, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./logs/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./logs", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_v8", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], 
+ "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/src/alignment/configs/config_loader.py b/src/alignment/configs/config_loader.py index 9e002feb..c87037d8 100644 --- a/src/alignment/configs/config_loader.py +++ b/src/alignment/configs/config_loader.py @@ -1228,6 +1228,27 @@ def _map_nested_to_flat_config(nested_config: Dict[str, Any]) -> Dict[str, Any]: flat_config["cluster_aware_late_alpha"] = float(ca["late_alpha"]) if "late_gamma" in ca: flat_config["cluster_aware_late_gamma"] = float(ca["late_gamma"]) + # Two-axis adaptive settings + if "twoaxis_depth_switch" in ca: + flat_config["cluster_aware_twoaxis_depth_switch"] = float(ca["twoaxis_depth_switch"]) + if "twoaxis_depth_sharpness" in ca: + flat_config["cluster_aware_twoaxis_depth_sharpness"] = float(ca["twoaxis_depth_sharpness"]) + if "twoaxis_ixy_weight" in ca: + flat_config["cluster_aware_twoaxis_ixy_weight"] = float(ca["twoaxis_ixy_weight"]) + if "twoaxis_red_internal_weight" in ca: + flat_config["cluster_aware_twoaxis_red_internal_weight"] = float(ca["twoaxis_red_internal_weight"]) + if "twoaxis_syn_weight" in ca: 
+ flat_config["cluster_aware_twoaxis_syn_weight"] = float(ca["twoaxis_syn_weight"]) + if "twoaxis_early_task_weight" in ca: + flat_config["cluster_aware_twoaxis_early_task_weight"] = float(ca["twoaxis_early_task_weight"]) + if "twoaxis_late_task_weight" in ca: + flat_config["cluster_aware_twoaxis_late_task_weight"] = float(ca["twoaxis_late_task_weight"]) + if "twoaxis_red_target_weight" in ca: + flat_config["cluster_aware_twoaxis_red_target_weight"] = float(ca["twoaxis_red_target_weight"]) + if "twoaxis_use_pid_red_target" in ca: + flat_config["cluster_aware_twoaxis_use_pid_red_target"] = bool(ca["twoaxis_use_pid_red_target"]) + if "twoaxis_halo_weight" in ca: + flat_config["cluster_aware_twoaxis_halo_weight"] = float(ca["twoaxis_halo_weight"]) # Generalized Taylor pruning configuration (vision) if isinstance(pruning_block.get("generalized_taylor"), dict): @@ -1443,6 +1464,24 @@ def _map_nested_to_flat_config(nested_config: Dict[str, Any]) -> Dict[str, Any]: elif "base_output_dir" in nested_config: flat_config["base_output_dir"] = nested_config["base_output_dir"] + # ----------------------------------------------------------------------- + # Passthrough: copy any top-level keys from the input that are valid + # ExperimentConfig fields but were not explicitly mapped above. This + # prevents new dataclass fields (e.g. model_checkpoint, hybrid_taylor_allocation) + # from being silently dropped when loading already-flat JSON/YAML configs. 
+ # ----------------------------------------------------------------------- + try: + import dataclasses as _dc + + from alignment.experiments.base import ExperimentConfig as _EC + + valid_fields = {f.name for f in _dc.fields(_EC)} + for key, value in nested_config.items(): + if key in valid_fields and key not in flat_config: + flat_config[key] = value + except Exception: + pass # Graceful fallback if ExperimentConfig cannot be imported + return flat_config @@ -1581,6 +1620,16 @@ def load_config_with_overrides( "pruning.cluster_aware.early_gamma": "cluster_aware_early_gamma", "pruning.cluster_aware.late_alpha": "cluster_aware_late_alpha", "pruning.cluster_aware.late_gamma": "cluster_aware_late_gamma", + "pruning.cluster_aware.twoaxis_depth_switch": "cluster_aware_twoaxis_depth_switch", + "pruning.cluster_aware.twoaxis_depth_sharpness": "cluster_aware_twoaxis_depth_sharpness", + "pruning.cluster_aware.twoaxis_ixy_weight": "cluster_aware_twoaxis_ixy_weight", + "pruning.cluster_aware.twoaxis_red_internal_weight": "cluster_aware_twoaxis_red_internal_weight", + "pruning.cluster_aware.twoaxis_syn_weight": "cluster_aware_twoaxis_syn_weight", + "pruning.cluster_aware.twoaxis_early_task_weight": "cluster_aware_twoaxis_early_task_weight", + "pruning.cluster_aware.twoaxis_late_task_weight": "cluster_aware_twoaxis_late_task_weight", + "pruning.cluster_aware.twoaxis_red_target_weight": "cluster_aware_twoaxis_red_target_weight", + "pruning.cluster_aware.twoaxis_use_pid_red_target": "cluster_aware_twoaxis_use_pid_red_target", + "pruning.cluster_aware.twoaxis_halo_weight": "cluster_aware_twoaxis_halo_weight", # Pruning distribution safety caps "pruning.distribution": "pruning_distribution", "pruning.dependency_aware": "dependency_aware_pruning", diff --git a/src/alignment/experiments/base.py b/src/alignment/experiments/base.py index d9105496..f8b456c6 100644 --- a/src/alignment/experiments/base.py +++ b/src/alignment/experiments/base.py @@ -134,6 +134,10 @@ class ExperimentConfig: 
# If you explicitly want to reuse the local sampling scheme, set to "match" # (not recommended: it repeats the same image-level target across spatial samples). task_activation_samples: Optional[str] = None + # Optional control for target-directed metrics: + # - "none": use true task target + # - "batch": shuffle target values within each calibration batch + task_target_permutation: str = "none" spatial_samples_per_image: int = 16 # used when activation_samples="flatten_spatial" n_clusters: int = 4 synergy_target: str = "logit_margin" # logit_margin, correct_logit, logit_pc1 @@ -168,6 +172,18 @@ class ExperimentConfig: # - "quantile": Partition by composite score quartiles (no k-means) clustering_importance_mode: str = "geometric" + # Clustering feature set: which 3 features to use for k-means clustering + # - "internal": (first_metric, internal_redundancy, synergy) — original features + # - "pid": (task_MI, pid_redundancy_t, synergy) — PID target-directed features + clustering_feature_set: str = "internal" + + # Input MI proxy reference scale: + # mi_in_proxy = 0.5 * log(1 + signal_power / sigma0_sq) + # where signal_power = RQ * ||w||^2. Default reproduces legacy behavior. + mi_in_proxy_sigma_mode: str = "median" # median|mean|p75|p90|quantile|fixed + mi_in_proxy_sigma_fixed: float = 0.0 + mi_in_proxy_sigma_quantile: float = 50.0 + # Optional: compute per-channel loss proxy (Fisher/GN-style) on calibration data. compute_loss_proxy: bool = False loss_proxy_n_calibration: int = 1024 @@ -232,6 +248,23 @@ class ExperimentConfig: cluster_aware_late_gamma: float = 0.5 # Higher redundancy penalty late # Fraction of layers considered "early" cluster_aware_early_layer_frac: float = 0.3 + # Two-axis adaptive CAP (local IXY/Red/Syn + target TaskMI/PID-Red blend). + # Blend weight to target branch follows a depth sigmoid: + # mix(depth) = sigmoid(sharpness * (depth - depth_switch)) + # where depth in [0,1]. 
+ cluster_aware_twoaxis_depth_switch: float = 0.5 + cluster_aware_twoaxis_depth_sharpness: float = 10.0 + # Local branch (IXY/internal Red/Syn) weights + cluster_aware_twoaxis_ixy_weight: float = 1.0 + cluster_aware_twoaxis_red_internal_weight: float = 0.25 + cluster_aware_twoaxis_syn_weight: float = 0.0 + # Target branch (TaskMI/PID-Red) weights + cluster_aware_twoaxis_early_task_weight: float = 0.05 + cluster_aware_twoaxis_late_task_weight: float = 0.35 + cluster_aware_twoaxis_red_target_weight: float = 0.0 + cluster_aware_twoaxis_use_pid_red_target: bool = True + # Halo contribution for the two-axis score. + cluster_aware_twoaxis_halo_weight: float = 0.3 # --------------------------------------------------------------------- # Generalized Taylor pruning (vision) @@ -298,6 +331,11 @@ class ExperimentConfig: # When enabled, per-layer prune counts are adjusted so the total number of # pruned channels matches round(target_sparsity * total_prunable_channels). pruning_enforce_exact_global_channel_budget: bool = False + # When True, use Taylor channel scores for computing per-layer pruning + # allocation (via the distribution manager), while the method's own scores + # are used for within-layer channel ranking. This decouples allocation + # (gradient-aware) from ranking (substitutability-aware). + hybrid_taylor_allocation: bool = False fine_tune_learning_rate: Optional[float] = None # Will default to learning_rate * 0.1 # Optional cap for post-pruning fine-tuning speed (useful for ImageNet-scale runs) # None => use the full training loader each epoch. 
diff --git a/src/alignment/experiments/cluster_experiments.py b/src/alignment/experiments/cluster_experiments.py index b138d5db..4bb9fb8c 100644 --- a/src/alignment/experiments/cluster_experiments.py +++ b/src/alignment/experiments/cluster_experiments.py @@ -176,6 +176,67 @@ def variance(self) -> np.ndarray: return np.clip(var, 1e-12, None) +def _maybe_permute_task_target(target: np.ndarray, mode: str, rng: np.random.Generator) -> np.ndarray: + """ + Optional target permutation control for task-directed metrics. + + mode: + - "none": no permutation + - "batch"/"shuffle": shuffle target values within each batch + """ + t = np.asarray(target, dtype=np.float64).reshape(-1) + m = str(mode or "none").strip().lower() + if m in {"none", "off", "false", "0"}: + return t + if t.size <= 1: + return t + if m in {"batch", "within_batch", "shuffle", "permute"}: + return t[rng.permutation(t.size)] + return t + + +def _mi_in_proxy_from_signal_power( + signal_power: np.ndarray, + *, + sigma_mode: str = "median", + sigma_fixed: float = 0.0, + sigma_quantile: float = 50.0, +) -> Tuple[np.ndarray, float]: + """ + Compute MI-in proxy with configurable reference noise level sigma0^2. 
+ + sigma_mode: + - "median" (default), "mean", "p75", "p90", "quantile", "fixed" + """ + sp = np.asarray(signal_power, dtype=np.float64).reshape(-1) + sp = np.clip(sp, 0.0, None) + finite = sp[np.isfinite(sp)] + if finite.size == 0: + finite = np.array([1e-12], dtype=np.float64) + + mode = str(sigma_mode or "median").strip().lower() + if mode == "mean": + sigma0_sq = float(np.mean(finite)) + elif mode in {"p75", "q75"}: + sigma0_sq = float(np.percentile(finite, 75.0)) + elif mode in {"p90", "q90"}: + sigma0_sq = float(np.percentile(finite, 90.0)) + elif mode in {"quantile", "percentile"}: + q = float(np.clip(float(sigma_quantile), 1.0, 99.0)) + sigma0_sq = float(np.percentile(finite, q)) + elif mode == "fixed": + sigma0_sq = float(sigma_fixed) + else: + sigma0_sq = float(np.median(finite)) + + if not np.isfinite(sigma0_sq) or sigma0_sq <= 0.0: + sigma0_sq = float(np.median(np.clip(finite, 1e-12, None))) + sigma0_sq = max(sigma0_sq, 1e-12) + + mi_in_proxy = 0.5 * np.log1p(sp / sigma0_sq) + return mi_in_proxy.astype(np.float64), float(sigma0_sq) + + from .base import ExperimentConfig # --------------------------------------------------------------------- @@ -704,6 +765,7 @@ def _bn_for_conv_name(conv_name: str): task_mode = activation_mode samples_per_img = int(self.config.spatial_samples_per_image) samples_per_img = max(1, samples_per_img) + target_perm_mode = str(getattr(self.config, "task_target_permutation", "none") or "none") rng = np.random.default_rng(int(self.config.seed)) @@ -735,6 +797,7 @@ def _bn_for_conv_name(conv_name: str): mask[torch.arange(bsz, device=logits.device), y] = False max_incorrect = logits.masked_fill(~mask, float("-inf")).max(dim=1)[0] T_img = (correct_logits - max_incorrect).detach().cpu().numpy() # [B] + T_img = _maybe_permute_task_target(T_img, target_perm_mode, rng) # Update each layer accumulator using the captured activations for name, layer in self.layers: @@ -891,8 +954,17 @@ def _bn_for_conv_name(conv_name: str): # 1b) Input MI 
proxy (scale-sensitive): 0.5 * log(1 + RQ * ||w||^2 / sigma0^2) # We use a per-layer reference sigma0^2 to make the proxy comparable across depth. signal_power = (metrics["rq"] * weight_norm[:n_channels]).astype(np.float64) - sigma0_sq = float(np.median(signal_power)) + 1e-12 - metrics["mi_in_proxy"] = (0.5 * np.log1p(signal_power / sigma0_sq)).astype(np.float64) + sigma_mode = str(getattr(self.config, "mi_in_proxy_sigma_mode", "median") or "median") + sigma_fixed = float(getattr(self.config, "mi_in_proxy_sigma_fixed", 0.0) or 0.0) + sigma_quantile = float(getattr(self.config, "mi_in_proxy_sigma_quantile", 50.0) or 50.0) + mi_in_proxy, sigma0_sq = _mi_in_proxy_from_signal_power( + signal_power, + sigma_mode=sigma_mode, + sigma_fixed=sigma_fixed, + sigma_quantile=sigma_quantile, + ) + metrics["mi_in_proxy"] = mi_in_proxy + metrics["mi_in_proxy_sigma0_sq"] = np.full(n_channels, sigma0_sq, dtype=np.float64) # 2) Redundancy via Gaussian MI from correlations denom = np.sqrt(np.outer(var_y, var_y)) + 1e-12 @@ -917,6 +989,8 @@ def _bn_for_conv_name(conv_name: str): top_m = max(1, min(top_m, candidate_pool - 1)) synergy = np.zeros(n_channels, dtype=np.float64) + pid_redundancy_t = np.zeros(n_channels, dtype=np.float64) # PID redundancy about target + pid_unique_t = np.zeros(n_channels, dtype=np.float64) # PID unique info about target # Partner ordering by redundancy (Gaussian MI) on task-level pooled activations. 
denom_task = np.sqrt(np.outer(var_y_task, var_y_task)) + 1e-12 @@ -955,6 +1029,7 @@ def _bn_for_conv_name(conv_name: str): mi_i = float(mi_t[i]) syn_pairs: List[Tuple[float, int]] = [] + pid_red_pairs: List[float] = [] for j in cand: j = int(j) mi_j = float(mi_t[j]) @@ -967,12 +1042,17 @@ def _bn_for_conv_name(conv_name: str): cov_t_j=float(cov_ty_task[j]), cov_i_j=cov_i_j, ) - s = mi_joint - mi_i - mi_j + min(mi_i, mi_j) + r_t = min(mi_i, mi_j) # PID redundancy about target (MMI) + s = mi_joint - mi_i - mi_j + r_t syn_pairs.append((float(s), j)) + pid_red_pairs.append(float(r_t)) if syn_pairs: syn_pairs.sort(key=lambda x: x[0], reverse=True) synergy[i] = float(np.mean([s for (s, _j) in syn_pairs[:top_m]])) + if pid_red_pairs: + pid_redundancy_t[i] = float(np.mean(pid_red_pairs)) + pid_unique_t[i] = max(0.0, mi_i - pid_redundancy_t[i]) if collect_within and syn_idx is not None and syn_val is not None: top_edges = syn_pairs[:syn_k] if top_edges: @@ -980,6 +1060,8 @@ def _bn_for_conv_name(conv_name: str): syn_val[i, : len(top_edges)] = np.asarray([s for (s, _j) in top_edges], dtype=np.float32) metrics["synergy"] = synergy + metrics["pid_redundancy_t"] = pid_redundancy_t + metrics["pid_unique_t"] = pid_unique_t if collect_within and red_idx is not None and red_val is not None and syn_idx is not None and syn_val is not None: self._within_layer_neighbors[name] = { @@ -1168,6 +1250,7 @@ def run_clustering( run_ablation: Optional[bool] = None, first_metric: Optional[str] = None, clustering_importance_mode: Optional[str] = None, + clustering_feature_set: Optional[str] = None, ) -> Dict[str, Any]: """ Cluster channels in each layer. @@ -1184,6 +1267,10 @@ def run_clustering( - "score_augmented": k-means with importance score as extra feature - "importance_reassign": k-means geometry, reassign types by score - "quantile": Partition by composite score quartiles + clustering_feature_set: Override for clustering feature set. 
One of: + - "internal": (first_metric, redundancy, synergy) — original + - "pid": (task_MI, pid_redundancy_t, synergy) — target-directed PID + Uses config.clustering_feature_set if not specified. Returns: Dict with cluster results (and ablation results if enabled) @@ -1199,11 +1286,18 @@ def run_clustering( metric_key = "rq" metric_label = "RQ" + # Determine clustering feature set + feat_set = clustering_feature_set or getattr(self.config, "clustering_feature_set", "internal") + feat_set = str(feat_set).lower() + # Determine clustering importance mode c_mode = clustering_importance_mode or getattr(self.config, "clustering_importance_mode", "geometric") c_mode = str(c_mode).lower() - logger.info(f"Clustering channels using {metric_label} as first metric, mode={c_mode}...") + if feat_set == "pid": + logger.info(f"Clustering channels using PID features (task_MI, pid_redundancy_t, synergy), mode={c_mode}...") + else: + logger.info(f"Clustering channels using {metric_label} as first metric, mode={c_mode}...") run_ablation = run_ablation if run_ablation is not None else bool(self.config.run_metric_ablation) @@ -1221,19 +1315,34 @@ def run_clustering( gamma = float(getattr(self.config, "cluster_aware_gamma", 0.3)) for name, metrics in self.layer_metrics.items(): - # Get the first metric (RQ or I(X;Y)) - first_values = metrics.get(metric_key) - if first_values is None: - first_values = metrics.get("rq", np.ones(1)) - if first_metric == "ixy": - logger.warning(f" {name}: mi_in_proxy not available, falling back to RQ") + # Select clustering features based on feature set + if feat_set == "pid": + # PID features: (task_MI, pid_redundancy_t, synergy) + first_values = metrics.get("task_mi") + if first_values is None: + first_values = metrics.get(metric_key, np.ones(1)) + logger.warning(f" {name}: task_mi not available, falling back to {metric_label}") + clust_redundancy = metrics.get("pid_redundancy_t") + if clust_redundancy is None: + clust_redundancy = 
metrics.get("redundancy", np.zeros(1)) + logger.warning(f" {name}: pid_redundancy_t not available, falling back to internal redundancy") + clust_synergy = metrics.get("synergy", np.zeros(1)) + else: + # Internal features: (first_metric, redundancy, synergy) + first_values = metrics.get(metric_key) + if first_values is None: + first_values = metrics.get("rq", np.ones(1)) + if first_metric == "ixy": + logger.warning(f" {name}: mi_in_proxy not available, falling back to RQ") + clust_redundancy = metrics.get("redundancy", np.zeros(1)) + clust_synergy = metrics.get("synergy", np.zeros(1)) # Compute importance scores for non-geometric modes importance_scores = None if c_mode != "geometric": fv = np.asarray(first_values, dtype=np.float64).flatten() - rd = np.asarray(metrics.get("redundancy", np.zeros_like(fv)), dtype=np.float64).flatten() - sy = np.asarray(metrics.get("synergy", np.zeros_like(fv)), dtype=np.float64).flatten() + rd = np.asarray(clust_redundancy, dtype=np.float64).flatten() + sy = np.asarray(clust_synergy, dtype=np.float64).flatten() n = min(len(fv), len(rd), len(sy)) if n > 0: fv, rd, sy = fv[:n], rd[:n], sy[:n] @@ -1247,8 +1356,8 @@ def _n01(x): result = clusterer.fit( first_values, - metrics["redundancy"], - metrics["synergy"], + clust_redundancy, + clust_synergy, name, importance_scores=importance_scores, clustering_mode=c_mode, @@ -1262,6 +1371,7 @@ def _n01(x): "layer_name": name, "ablation_mode": "all", "first_metric": first_metric, + "feature_set": feat_set, "clustering_mode": c_mode, } logger.info(f" {name}: silhouette={result.silhouette:.3f}, types={result.type_counts}") @@ -1928,7 +2038,15 @@ def _checkpoint_pruning_results() -> None: selection_mode = self._selection_mode_for_method(prune_method) try: - if prune_method.startswith("cluster_aware") or prune_method in ("cap_ixy", "composite_ixy"): + if prune_method.startswith("cluster_aware") or prune_method in ( + "cap_ixy", + "composite_ixy", + "composite_twoaxis", + "composite_twoaxis_ixy", + 
"composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + ): pipeline_result = self._run_cluster_aware_pruning( model_copy, layer_modules=layer_modules, @@ -2863,9 +2981,13 @@ def _compute_layer_scores_for_method(self, method: str, model: nn.Module) -> Dic "lp_low": "loss_proxy", "lp_high": "loss_proxy", } - for name, layer in modules.items(): + layer_names = list(modules.keys()) + depth_den = max(1, len(layer_names) - 1) + for idx, name in enumerate(layer_names): + layer = modules.get(name) if layer is None or not hasattr(layer, "weight"): continue + depth_frac = float(idx) / float(depth_den) weight = layer.weight device = weight.device metrics = self.layer_metrics.get(name, {}) @@ -2990,13 +3112,18 @@ def _compute_layer_scores_for_method(self, method: str, model: nn.Module) -> Dic elif method in { "composite", "composite_pos_red", + "composite_twoaxis", + "composite_twoaxis_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", "rq_minus_red", "rq_plus_red", "magnitude_plus_rq", "magnitude_minus_red", "magnitude_plus_red", }: - comp = self._compute_composite_metric(method, metrics, layer) + comp = self._compute_composite_metric(method, metrics, layer, depth_frac=depth_frac) if comp is not None: layer_scores[name] = comp.to(device) # ------------------------------------------------------------------ @@ -3163,12 +3290,21 @@ def _compute_layer_scores_for_method(self, method: str, model: nn.Module) -> Dic return {} return layer_scores - def _compute_composite_metric(self, method: str, metrics: Dict[str, np.ndarray], layer: nn.Module) -> Optional[torch.Tensor]: + def _compute_composite_metric( + self, + method: str, + metrics: Dict[str, np.ndarray], + layer: nn.Module, + depth_frac: float = 0.5, + ) -> Optional[torch.Tensor]: rq = np.log(np.clip(metrics.get("rq", np.ones(layer.weight.shape[0])), 1e-10, None)) redundancy = metrics.get("redundancy", np.zeros_like(rq)) synergy = metrics.get("synergy", np.zeros_like(rq)) # I(X;Y) - 
mutual information proxy (already in log scale from computation) ixy = metrics.get("mi_in_proxy", rq) # fallback to rq if not available + # PID target-directed metrics + task_mi = metrics.get("task_mi", ixy) # fallback to ixy + pid_red_t = metrics.get("pid_redundancy_t", redundancy) # fallback to internal redundancy def normalize(arr: np.ndarray) -> np.ndarray: if arr.size == 0: @@ -3183,6 +3319,8 @@ def normalize(arr: np.ndarray) -> np.ndarray: ixy_norm = normalize(ixy) red_norm = normalize(redundancy) syn_norm = normalize(synergy) + task_mi_norm = normalize(task_mi) + pid_red_t_norm = normalize(pid_red_t) if method == "composite": scores = rq_norm + 0.5 * syn_norm - 0.3 * red_norm @@ -3194,6 +3332,40 @@ def normalize(arr: np.ndarray) -> np.ndarray: scores = ixy_norm + 0.5 * syn_norm - 0.3 * red_norm elif method == "composite_ixy_pos_red": scores = ixy_norm + 0.5 * syn_norm + 0.3 * red_norm + # Depth-adaptive two-axis composite (score-only; no halo term) + elif method in {"composite_twoaxis", "composite_twoaxis_ixy"}: + # Local axis: first metric (RQ or IXY), internal redundancy, synergy. + # Target axis: task-directed MI and (optionally) target-directed PID redundancy. 
+ use_ixy = method.endswith("_ixy") + first_norm = ixy_norm if use_ixy else rq_norm + red_target_norm = pid_red_t_norm if bool(self.config.cluster_aware_twoaxis_use_pid_red_target) else red_norm + + switch = float(self.config.cluster_aware_twoaxis_depth_switch) + sharp = float(self.config.cluster_aware_twoaxis_depth_sharpness) + early_task_w = float(self.config.cluster_aware_twoaxis_early_task_weight) + late_task_w = float(self.config.cluster_aware_twoaxis_late_task_weight) + task_gate = 1.0 / (1.0 + np.exp(-np.clip((float(depth_frac) - switch) * sharp, -60.0, 60.0))) + task_w = early_task_w + task_gate * (late_task_w - early_task_w) + + local = ( + float(self.config.cluster_aware_twoaxis_ixy_weight) * first_norm + + float(self.config.cluster_aware_twoaxis_syn_weight) * syn_norm + - float(self.config.cluster_aware_twoaxis_red_internal_weight) * red_norm + ) + target = task_w * (task_mi_norm - float(self.config.cluster_aware_twoaxis_red_target_weight) * red_target_norm) + scores = local + target + # PID target-directed composite variants + elif method == "composite_pid": + # Score = α·task_MI + β·Syn - γ·pid_redundancy_T + scores = task_mi_norm + 0.5 * syn_norm - 0.3 * pid_red_t_norm + elif method == "composite_pid_no_red": + # Score = task_MI only (since pid_red_t ≈ task_MI, check unique_T matters) + scores = task_mi_norm + elif method == "composite_pid_unique": + # Score based on unique target information: unique_T + synergy + unique_t = metrics.get("pid_unique_t", task_mi) + unique_t_norm = normalize(unique_t) + scores = unique_t_norm + 0.5 * syn_norm elif method == "ixy_minus_red": scores = ixy_norm - 0.5 * red_norm elif method == "ixy_plus_red": @@ -3223,6 +3395,116 @@ def normalize(arr: np.ndarray) -> np.ndarray: return torch.as_tensor(scores, dtype=torch.float32) + def _get_pairwise_matrices( + self, + layer_name: str, + n_channels: int, + max_samples: int = 1000, + num_syn_partners: int = 10, + ) -> Optional[tuple]: + """Compute pairwise R and S matrices for 
a layer. + + Collects activations from the calibration loader, computes: + R(i,j) = -0.5 * log(1 - rho(Y_i, Y_j)^2) — pairwise redundancy + S(T; Y_i, Y_j) = I(T; Y_i, Y_j) - I(T; Y_i) - I(T; Y_j) + min(...) — pairwise synergy + + Results are cached on self._pairwise_cache. + """ + if not hasattr(self, "_pairwise_cache"): + self._pairwise_cache: Dict[str, tuple] = {} + if layer_name in self._pairwise_cache: + return self._pairwise_cache[layer_name] + + import torch as _th + + device = next(self.model.parameters()).device if self.model is not None else "cpu" + module_map = dict(self.model.named_modules()) + target_module = module_map.get(layer_name) + if target_module is None: + return None + + # Collect GAP activations + logits via forward hook + acts_list: list = [] + logits_list: list = [] + labels_list: list = [] + + def _hook(_mod, _inp, out): + o = out.detach() + if o.ndim == 4: + o = o.mean(dim=(2, 3)) + acts_list.append(o.cpu()) + + h = target_module.register_forward_hook(_hook) + n_seen = 0 + try: + self.model.eval() + with _th.no_grad(): + for x, y in self._get_calibration_loader(): + if n_seen >= max_samples: + break + logits = self.model(x.to(device)) + logits_list.append(logits.cpu()) + labels_list.append(y) + n_seen += int(x.shape[0]) + finally: + h.remove() + + if not acts_list: + return None + + acts = _th.cat(acts_list, 0).numpy()[:, :n_channels] + logits = _th.cat(logits_list, 0).numpy() + gt_labels = _th.cat(labels_list, 0).numpy() + B, N = acts.shape + eps = 1e-8 + + # Pairwise R + corr = np.corrcoef(acts.T) + corr = np.nan_to_num(corr, nan=0.0) + corr = np.clip(corr, -0.999, 0.999) + R = -0.5 * np.log(1 - corr**2) + np.fill_diagonal(R, 0) + + # Pairwise S (sparse, top-K partners) + correct_logits = logits[np.arange(B), gt_labels] + mask = np.ones_like(logits, dtype=bool) + mask[np.arange(B), gt_labels] = False + max_incorrect = np.where(mask, logits, -np.inf).max(axis=1) + T = correct_logits - max_incorrect + + T_c = T - T.mean() + T_std = 
T_c.std() + eps + acts_c = acts - acts.mean(axis=0) + acts_std = acts_c.std(axis=0) + eps + rho_ind = (T_c[:, None] * acts_c).mean(axis=0) / (T_std * acts_std) + rho_ind = np.clip(rho_ind, -1 + eps, 1 - eps) + mi_ind = np.maximum(0, -0.5 * np.log(1 - rho_ind**2)) + + S = np.zeros((N, N)) + kp = min(num_syn_partners, N - 1) + for i in range(N): + partners = np.argsort(-mi_ind)[: kp + 1] + partners = [j for j in partners if j != i][:kp] + for j in partners: + if S[i, j] != 0: + continue + joint = np.column_stack([T, acts[:, i], acts[:, j]]) + joint_c = joint - joint.mean(axis=0) + cov = (joint_c.T @ joint_c) / (B - 1 + eps) + eps * np.eye(3) + var_T = cov[0, 0] + cov_Y = cov[1:, 1:] + det_all = np.linalg.det(cov) + det_Y = np.linalg.det(cov_Y) + if det_all <= 0 or det_Y <= 0 or var_T <= 0: + continue + mi_joint = max(0, 0.5 * np.log(var_T * det_Y / det_all)) + s = mi_joint - mi_ind[i] - mi_ind[j] + min(mi_ind[i], mi_ind[j]) + S[i, j] = s + S[j, i] = s + + self._pairwise_cache[layer_name] = (R, S) + return (R, S) + def _compute_halo_syn_proxy( self, *, @@ -3445,23 +3727,44 @@ def _run_cluster_aware_pruning( method_name = str(method).lower() # Normalize CAP aliases so variant logic below can be shared between - # RQ-first and I(X;Y)-first versions (e.g., *_ixy methods). + # RQ-first, I(X;Y)-first, and PID-first versions (e.g., *_ixy, *_pid methods). 
base_method = method_name if base_method in ("cap_ixy",): base_method = "cluster_aware" + # Strip _ixy suffix/infix if base_method.endswith("_ixy"): base_method = base_method[:-4] if "_ixy_" in base_method: base_method = base_method.replace("_ixy_", "_") + # Strip _pid suffix/infix (for CAP variants like cluster_aware_stratified_pid) + if base_method.endswith("_pid"): + base_method = base_method[:-4] + if "_pid_" in base_method: + base_method = base_method.replace("_pid_", "_") # Flag to track whether we should use I(X;Y) instead of RQ use_ixy_metric = method_name.endswith("_ixy") or "_ixy_" in method_name + # Flag for PID target-directed metric mode. + # PID methods use task_MI instead of RQ/IXY, and pid_redundancy_t instead of internal redundancy. + use_pid_metric = method_name.startswith("composite_pid") or "_pid_" in method_name or method_name.endswith("_pid") + if use_pid_metric: + use_ixy_metric = False # PID uses task_mi directly, not IXY + # Normalize PID composite variants to base_method="composite" + if base_method.startswith("composite_pid"): + base_method = "composite" + # Detect importance-aware clustering mode from method name. # E.g., "cluster_aware_importance_gradient_weighted_ixy" -> clustering_override = "importance_reassign" # The clustering suffix is removed from base_method so variant dispatch works normally. 
_clustering_override: Optional[str] = None for _csuffix, _cmode in [ + ("_spectral_pid_rs", "spectral_pid_rs"), + ("_spectral_pid_rt", "spectral_pid_rt"), + ("_spectral_pid_s", "spectral_pid_s"), + ("_spectral_rs", "spectral_rs"), + ("_spectral_r", "spectral_r"), + ("_spectral_s", "spectral_s"), ("_importance", "importance_reassign"), ("_quantile", "quantile"), ("_score_augmented", "score_augmented"), @@ -3478,6 +3781,7 @@ def _run_cluster_aware_pruning( "cluster_aware_stratified", "cluster_aware_stratified_nohalo", "cluster_aware_region_stratified", + "cluster_aware_stratified_twoaxis_adaptive", }: # Label-free variants: avoid type-priority heuristics and treat clusters as structure only. cfg.target_redundant = False @@ -3505,7 +3809,7 @@ def _run_cluster_aware_pruning( # Use I(X;Y) instead of RQ in the CAP score # Score_i = α·log(I(X;Y)_i) + β·Syn_i - γ·Red_i + λ·HaloSyn_i use_ixy_metric = True - elif base_method == "composite": + elif base_method in {"composite", "composite_twoaxis"}: # Score-only baseline (no halo term, no type constraints). 
# This branch is primarily used by "composite_ixy" so the # first metric can be switched to I(X;Y) while keeping the @@ -3637,9 +3941,22 @@ def _run_cluster_aware_pruning( except Exception: pass - # Prepare metrics for the pruner - optionally use I(X;Y) instead of RQ + # Prepare metrics for the pruner - optionally use I(X;Y) or PID metrics instead of RQ pruner_metrics = pre_metrics.copy() if hasattr(pre_metrics, "copy") else dict(pre_metrics) - if use_ixy_metric: + if use_pid_metric: + # Replace RQ with task_MI and redundancy with pid_redundancy_t for PID-based scoring + task_mi_values = pre_metrics.get("task_mi") + pid_red_values = pre_metrics.get("pid_redundancy_t") + if task_mi_values is not None: + pruner_metrics = dict(pre_metrics) + pruner_metrics["rq"] = task_mi_values # Score formula uses "rq" key + logger.debug(f" {layer_name}: Using task_MI instead of RQ for PID score") + else: + logger.warning(f" {layer_name}: task_mi not available, using RQ") + if pid_red_values is not None: + pruner_metrics["redundancy"] = pid_red_values + logger.debug(f" {layer_name}: Using pid_redundancy_t instead of internal redundancy") + elif use_ixy_metric: # Replace RQ with I(X;Y) (mi_in_proxy) for the CAP score ixy_values = pre_metrics.get("mi_in_proxy") if ixy_values is not None: @@ -3680,7 +3997,71 @@ def _run_cluster_aware_pruning( # Importance-aware clustering overrides: re-cluster with importance-based # type assignment at pruning time (uses the same k-means geometry but # reassigns types by composite score, or uses quantile partitioning). - if _clustering_override is not None: + # Spectral overrides: use pairwise R/S matrices for spectral clustering. 
+ if _clustering_override is not None and _clustering_override.startswith("spectral"): + # Spectral clustering on pairwise R and/or S affinity matrices + try: + _is_pid_spectral = "pid" in _clustering_override + _pairwise = self._get_pairwise_matrices(layer_name, n_channels) + if _pairwise is not None: + _R_internal, _S_target = _pairwise + from sklearn.cluster import SpectralClustering as _SC + + _k = min(int(cfg.n_clusters), n_channels - 1) + + if _is_pid_spectral: + # PID pairwise matrices: + # R_T(i,j) = min(I(T;Y_i), I(T;Y_j)) — target-directed redundancy + # S_T(i,j) = already target-directed (same _S_target) + _tmi = np.asarray(pre_metrics.get("task_mi", np.zeros(n_channels)), dtype=np.float64).reshape(-1)[:n_channels] + _R_T = np.minimum(_tmi[:, None], _tmi[None, :]) + np.fill_diagonal(_R_T, 0) + + if _clustering_override == "spectral_pid_rt": + _aff = np.maximum(np.nan_to_num(_R_T, nan=0.0), 0) + elif _clustering_override == "spectral_pid_s": + _aff = np.maximum(np.nan_to_num(np.abs(_S_target), nan=0.0), 0) + else: # spectral_pid_rs + _RT_n = _R_T / (np.nanmax(_R_T) + 1e-12) + _S_abs = np.abs(_S_target) + _S_n = _S_abs / (np.nanmax(_S_abs) + 1e-12) + _aff = np.maximum(np.nan_to_num(0.5 * _RT_n + 0.5 * _S_n, nan=0.0), 0) + else: + # Internal pairwise matrices + if _clustering_override == "spectral_r": + _aff = np.maximum(np.nan_to_num(_R_internal, nan=0.0), 0) + elif _clustering_override == "spectral_s": + _aff = np.maximum(np.nan_to_num(np.abs(_S_target), nan=0.0), 0) + else: # spectral_rs + _R_n = _R_internal / (np.nanmax(_R_internal) + 1e-12) + _S_abs = np.abs(_S_target) + _S_n = _S_abs / (np.nanmax(_S_abs) + 1e-12) + _aff = np.maximum(np.nan_to_num(0.5 * _R_n + 0.5 * _S_n, nan=0.0), 0) + + np.fill_diagonal(_aff, 0) + if _aff.max() > 0 and _k >= 2: + _sc = _SC( + n_clusters=_k, + affinity="precomputed", + random_state=int(self.config.seed), + n_init=10, + assign_labels="kmeans", + ) + _spec_labels = _sc.fit_predict(_aff) + pruner_labels = 
_spec_labels[:n_channels] + # Build proportional type mapping (no semantic types) + pruner_type_mapping = {int(c): f"cluster_{c}" for c in np.unique(pruner_labels)} + logger.debug( + " %s: spectral(%s) k=%d, cluster sizes=%s", + layer_name, + _clustering_override, + _k, + [int(np.sum(pruner_labels == c)) for c in np.unique(pruner_labels)], + ) + except Exception as e: + logger.warning(" %s: spectral clustering failed (%s), using default", layer_name, e) + + elif _clustering_override is not None: try: _rq = np.asarray(pruner_metrics.get("rq", []), dtype=np.float64).reshape(-1)[:n_channels] _red = np.asarray(pruner_metrics.get("redundancy", []), dtype=np.float64).reshape(-1)[:n_channels] @@ -3719,8 +4100,17 @@ def _n01(x): "cluster_aware_stratified", "cluster_aware_stratified_nohalo", "cluster_aware_region_stratified", + "cluster_aware_stratified_twoaxis_adaptive", }: PrunerCls = ClusterAwareStratifiedPruning + # Spectral clustering modes use stratified (proportional) pruning + # since analysis showed proportional budgets work best with spectral clusters. + # Disable type-based constraints since spectral clusters don't have semantic labels. + if _clustering_override is not None and _clustering_override.startswith("spectral"): + PrunerCls = ClusterAwareStratifiedPruning + cfg.target_redundant = False + cfg.synergy_pair_constraint = False + cfg.protect_critical_frac = 1.0 pruner = PrunerCls( cfg, @@ -3957,6 +4347,89 @@ def _norm(x): scores = torch.from_numpy(score_np).float().to(scores.device) + # ------------------------------------------------------------------ + # OPTION 7: two-axis adaptive score + # Local axis (RQ/IXY, internal redundancy, synergy) + target axis + # (task MI, optional target redundancy), with depth-adaptive blending. + # For composite_twoaxis* this acts as score-only (no halo term). 
+ # ------------------------------------------------------------------ + elif base_method in { + "cluster_aware_twoaxis_adaptive", + "cluster_aware_stratified_twoaxis_adaptive", + "composite_twoaxis", + }: + lm = pruner_metrics + first_metric = np.asarray( + lm.get("rq", pre_metrics.get("rq", [])), + dtype=np.float64, + ).reshape(-1) + red_internal = np.asarray( + lm.get("redundancy", pre_metrics.get("redundancy", [])), + dtype=np.float64, + ).reshape(-1) + syn = np.asarray( + lm.get("synergy", pre_metrics.get("synergy", [])), + dtype=np.float64, + ).reshape(-1) + task_mi = np.asarray( + pre_metrics.get("task_mi", pre_metrics.get("mi_in_proxy", first_metric)), + dtype=np.float64, + ).reshape(-1) + red_target_pid = np.asarray( + pre_metrics.get("pid_redundancy_t", red_internal), + dtype=np.float64, + ).reshape(-1) + + n = min( + n_channels, + len(first_metric), + len(red_internal), + len(syn), + len(task_mi), + len(red_target_pid), + len(halo_syn), + ) + if n > 0: + first_metric = first_metric[:n] + red_internal = red_internal[:n] + syn = syn[:n] + task_mi = task_mi[:n] + red_target_pid = red_target_pid[:n] + halo_n = np.asarray(halo_syn[:n], dtype=np.float64) + + if not (use_ixy_metric or use_pid_metric): + # Keep RQ numerically stable and comparable to prior CAP scoring. 
+ first_metric = np.log(np.clip(first_metric, 1e-10, None)) + + def _n01(x): + x = np.asarray(x, dtype=np.float64) + lo, hi = x.min(), x.max() + if hi - lo < 1e-12: + return np.zeros_like(x) + return (x - lo) / (hi - lo) + + switch = float(self.config.cluster_aware_twoaxis_depth_switch) + sharp = float(self.config.cluster_aware_twoaxis_depth_sharpness) + gate = 1.0 / (1.0 + np.exp(-np.clip((float(depth_frac) - switch) * sharp, -60.0, 60.0))) + early_task_w = float(self.config.cluster_aware_twoaxis_early_task_weight) + late_task_w = float(self.config.cluster_aware_twoaxis_late_task_weight) + task_w = early_task_w + gate * (late_task_w - early_task_w) + + red_target = red_target_pid if bool(self.config.cluster_aware_twoaxis_use_pid_red_target) else red_internal + + local = ( + float(self.config.cluster_aware_twoaxis_ixy_weight) * _n01(first_metric) + + float(self.config.cluster_aware_twoaxis_syn_weight) * _n01(syn) + - float(self.config.cluster_aware_twoaxis_red_internal_weight) * _n01(red_internal) + ) + target = task_w * (_n01(task_mi) - float(self.config.cluster_aware_twoaxis_red_target_weight) * _n01(red_target)) + + score_np = local + target + if base_method != "composite_twoaxis": + score_np = score_np + float(self.config.cluster_aware_twoaxis_halo_weight) * _n01(halo_n) + + scores = torch.from_numpy(score_np).float().to(scores.device) + layer_scores[layer_name] = scores.detach() layer_pruners[layer_name] = pruner @@ -3969,6 +4442,32 @@ def _norm(x): } # Compute per-layer amounts using the shared distribution manager. + # + # Hybrid Taylor allocation: when enabled, use Taylor channel scores for + # the *allocation* step (deciding how many channels each layer loses) + # while keeping the method's own IXY/CAP scores for the *ranking* step + # (deciding which channels to prune within each layer). 
+ allocation_scores = layer_scores + if getattr(self.config, "hybrid_taylor_allocation", False) and distribution != "uniform": + if "taylor" not in self._pruning_score_cache: + self._pruning_score_cache["taylor"] = self._compute_taylor_channel_scores(self.model) + taylor_cache = self._pruning_score_cache.get("taylor", {}) + taylor_alloc: Dict[str, "torch.Tensor"] = {} + n_taylor = 0 + for nm in layer_scores: + if nm in taylor_cache: + taylor_alloc[nm] = taylor_cache[nm] + n_taylor += 1 + else: + taylor_alloc[nm] = layer_scores[nm] + if taylor_alloc: + allocation_scores = taylor_alloc + logger.info( + "Hybrid Taylor allocation: using Taylor scores for %d/%d layers", + n_taylor, + len(layer_scores), + ) + try: from ..pruning.distribution import PruningDistributionManager @@ -3980,8 +4479,8 @@ def _norm(x): max_per_layer_sparsity_cap=float(self.config.pruning_max_per_layer_sparsity_cap), ) # Only include layers we actually scored - scored_names = [nm for nm in layer_names_all if nm in layer_scores] - per_layer_amounts = manager.compute_distribution(model, scored_names, layer_scores=layer_scores) + scored_names = [nm for nm in layer_names_all if nm in allocation_scores] + per_layer_amounts = manager.compute_distribution(model, scored_names, layer_scores=allocation_scores) except Exception as exc: logger.warning( "Cluster-aware pruning: failed to compute distribution '%s' (%s); falling back to uniform", diff --git a/src/alignment/pruning/strategies/generalized_taylor.py b/src/alignment/pruning/strategies/generalized_taylor.py index 5c432b88..f5f3a5d1 100644 --- a/src/alignment/pruning/strategies/generalized_taylor.py +++ b/src/alignment/pruning/strategies/generalized_taylor.py @@ -256,13 +256,22 @@ def compute_importance_scores( elif variant == "redundancy_discounted_taylor": # Taylor / (1 + β·redundancy): discount redundant channels + # Layer-normalized: rescale to preserve layer-level mean. 
beta = self.config.redundancy_discount_beta - scores = taylor_norm / (1 + beta * red_norm) + modulated = taylor_norm / (1 + beta * red_norm) + orig_mean = float(np.mean(taylor_norm)) if taylor_norm.size > 0 else 1.0 + mod_mean = float(np.mean(modulated)) if modulated.size > 0 else 1.0 + scores = modulated * (orig_mean / mod_mean) if mod_mean > 1e-12 else modulated elif variant == "synergy_boosted_taylor": # Taylor × (1 + γ·synergy): boost synergistic channels + # Layer-normalized: rescale to preserve layer-level mean so that + # inter-layer calibration is maintained under global_threshold. gamma = self.config.synergy_boost_gamma - scores = taylor_norm * (1 + gamma * syn_norm) + modulated = taylor_norm * (1 + gamma * syn_norm) + orig_mean = float(np.mean(taylor_norm)) if taylor_norm.size > 0 else 1.0 + mod_mean = float(np.mean(modulated)) if modulated.size > 0 else 1.0 + scores = modulated * (orig_mean / mod_mean) if mod_mean > 1e-12 else modulated elif variant == "structural_taylor": # |∂L/∂a| × structural_score @@ -328,10 +337,18 @@ def compute_importance_scores( elif variant == "mi_taylor": # Taylor × MI(channel, task) - scores = taylor_norm * (mi_norm + float(self.config.structural_eps)) + # Layer-normalized: rescale to preserve layer-level mean. + modulated = taylor_norm * (mi_norm + float(self.config.structural_eps)) + orig_mean = float(np.mean(taylor_norm)) if taylor_norm.size > 0 else 1.0 + mod_mean = float(np.mean(modulated)) if modulated.size > 0 else 1.0 + scores = modulated * (orig_mean / mod_mean) if mod_mean > 1e-12 else modulated elif variant == "cluster_type_taylor": # Taylor × type_multiplier based on cluster membership + # Layer-normalized: after multiplying, rescale so that the layer-level + # mean matches the original Taylor mean. This preserves Taylor's + # inter-layer calibration (critical for global_threshold distribution) + # while changing only the within-layer ranking. 
clusters = self.precomputed_clusters labels = np.asarray(clusters.get("labels", np.zeros(n_channels, dtype=int)))[:n_channels] type_mapping = clusters.get("type_mapping", {}) @@ -351,7 +368,15 @@ def compute_importance_scores( mask = labels == cluster_id multipliers[mask] = mult - scores = taylor_norm * multipliers + modulated = taylor_norm * multipliers + + # Rescale to preserve layer-level mean (inter-layer calibration) + orig_mean = float(np.mean(taylor_norm)) if taylor_norm.size > 0 else 1.0 + mod_mean = float(np.mean(modulated)) if modulated.size > 0 else 1.0 + if mod_mean > 1e-12: + scores = modulated * (orig_mean / mod_mean) + else: + scores = modulated elif variant == "full_generalized": # |∇L|^α × |a|^β × f(metrics) diff --git a/tests/unit/metrics/test_neurips_controls.py b/tests/unit/metrics/test_neurips_controls.py new file mode 100644 index 00000000..c353066e --- /dev/null +++ b/tests/unit/metrics/test_neurips_controls.py @@ -0,0 +1,40 @@ +""" +Unit tests for NeurIPS control knobs used in cluster-analysis experiments. 
+""" + +import numpy as np + +from alignment.experiments.cluster_experiments import _maybe_permute_task_target, _mi_in_proxy_from_signal_power + + +def test_target_permutation_none_is_identity(): + rng = np.random.default_rng(7) + t = np.array([0.1, 0.3, 0.2, -0.5], dtype=np.float64) + out = _maybe_permute_task_target(t, "none", rng) + np.testing.assert_allclose(out, t) + + +def test_target_permutation_batch_preserves_multiset(): + rng = np.random.default_rng(11) + t = np.array([0.1, 0.3, 0.2, -0.5, 0.9], dtype=np.float64) + out = _maybe_permute_task_target(t, "batch", rng) + assert out.shape == t.shape + assert sorted(out.tolist()) == sorted(t.tolist()) + + +def test_mi_in_proxy_default_uses_median_reference(): + signal_power = np.array([1.0, 2.0, 4.0, 8.0], dtype=np.float64) + mi, sigma0 = _mi_in_proxy_from_signal_power(signal_power, sigma_mode="median") + expected_sigma0 = float(np.median(signal_power)) + expected = 0.5 * np.log1p(signal_power / expected_sigma0) + assert abs(sigma0 - expected_sigma0) < 1e-12 + np.testing.assert_allclose(mi, expected, rtol=1e-10, atol=1e-12) + + +def test_mi_in_proxy_sigma_modes_change_scale(): + signal_power = np.array([1.0, 2.0, 4.0, 16.0, 32.0], dtype=np.float64) + mi_med, sigma_med = _mi_in_proxy_from_signal_power(signal_power, sigma_mode="median") + mi_p90, sigma_p90 = _mi_in_proxy_from_signal_power(signal_power, sigma_mode="p90") + + assert sigma_p90 > sigma_med + assert np.mean(mi_p90) < np.mean(mi_med) diff --git a/tests/unit/test_experiments.py b/tests/unit/test_experiments.py index cacda495..263b2da4 100644 --- a/tests/unit/test_experiments.py +++ b/tests/unit/test_experiments.py @@ -24,6 +24,8 @@ def test_default_values(self): config = ExperimentConfig(name="test") assert config.experiment_type == "alignment_analysis" assert config.seed == 42 + assert config.task_target_permutation == "none" + assert config.mi_in_proxy_sigma_mode == "median" class TestPruningConfig: From 76ed7d991c3e1bc86df853124618056160d8754d Mon Sep 
17 00:00:00 2001 From: Houman Safaai Date: Sat, 18 Apr 2026 14:58:24 -0400 Subject: [PATCH 2/7] add docs/papers.md multi-paper index Track the framework-level public index of papers built on this codebase. Includes the two-axes NeurIPS 2026 manuscript and guidance for adding new papers. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/papers.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 docs/papers.md diff --git a/docs/papers.md b/docs/papers.md new file mode 100644 index 00000000..091dfd2a --- /dev/null +++ b/docs/papers.md @@ -0,0 +1,28 @@ +# Papers built on this framework + +This framework underpins several research projects. Each paper has its own +companion repository with LaTeX source, paper-specific analysis scripts, and +reproducibility instructions. Paper repositories depend on a pinned version +of this framework; the recommended workflow is to clone both, check out the +framework at the pinned ref, and follow the paper repo's `reproduce.md`. + +| Paper | Status | Paper repo | Framework ref | Year | +|-------|--------|------------|---------------|------| +| *Relevance Is Not Replaceability: Orthogonal Axes of Channel Information in Vision Networks* | manuscript companion repo | [KempnerInstitute/alignment_notes](https://github.com/KempnerInstitute/alignment_notes) | `ca438bd1419849775a08d366416486ba2c03ccdc` | 2026 | + +## Adding a new paper + +When starting a new paper that uses this framework: + +1. Create a dedicated git repo for the paper under the Kempner org (e.g. + `KempnerInstitute/<paper-name>`). +2. Commit the LaTeX source, figure-generation scripts, paper-specific analysis + scripts, and a `README.md`, `reproduce.md`, `CITATION.cff`, and + `pinned_commit.txt` in that repo. +3. The paper repo may live under `drafts/<paper-name>/` inside this + framework repo's working tree — `drafts/` is gitignored here, so it won't + pollute the framework repo. Each paper repo should have its own `.gitignore`. +4. 
Before a public release, cut a tag in *this* framework repo (e.g. + `neurips2026-<paper-name>-v1`) that matches the framework commit the paper + used, and record the full SHA in the paper repo's `pinned_commit.txt`. +5. Add a row to the table above. From 9a13483619811e288ec7d8d8fcfc3447f8e4af48 Mon Sep 17 00:00:00 2001 From: Houman Safaai Date: Sat, 18 Apr 2026 15:24:58 -0400 Subject: [PATCH 3/7] update configs --- configs/prune_llm/README.md | 13 +- ...ma3_70b_scale_benchmarks_50_papersafe.yaml | 146 ++++++ .../prune_llm/llama3_70b_scale_mechanism.yaml | 126 +++++ .../llama3_70b_scale_pruning_curves.yaml | 137 ++++++ .../llama3_70b_scale_sparsegpt_curves.yaml | 92 ++++ ...tiny_imagenet100_locked_pilot_seed123.yaml | 249 ++++++++++ ..._tiny_imagenet100_locked_pilot_seed42.yaml | 249 ++++++++++ ...tiny_imagenet100_locked_pilot_seed456.yaml | 249 ++++++++++ ...vnext_tiny_imagenet100_struct_seed123.yaml | 213 +++++++++ ...nvnext_tiny_imagenet100_struct_seed42.yaml | 213 +++++++++ ...vnext_tiny_imagenet100_struct_seed456.yaml | 213 +++++++++ .../convnext_tiny_imagenet100_smoke.yaml | 204 ++++++++ ...t_imagenet100_locked_breadth_seed1011.yaml | 305 ++++++++++++ ...et_imagenet100_locked_breadth_seed123.yaml | 305 ++++++++++++ ...net_imagenet100_locked_breadth_seed42.yaml | 305 ++++++++++++ ...et_imagenet100_locked_breadth_seed456.yaml | 305 ++++++++++++ ...et_imagenet100_locked_breadth_seed789.yaml | 305 ++++++++++++ ...netv2_cifar10_locked_breadth_seed1011.yaml | 325 +++++++++++++ ...enetv2_cifar10_locked_breadth_seed123.yaml | 325 +++++++++++++ ...lenetv2_cifar10_locked_breadth_seed42.yaml | 325 +++++++++++++ ...enetv2_cifar10_locked_breadth_seed456.yaml | 325 +++++++++++++ ...enetv2_cifar10_locked_breadth_seed789.yaml | 325 +++++++++++++ ...2_imagenet100_locked_breadth_seed1011.yaml | 328 +++++++++++++ ...v2_imagenet100_locked_breadth_seed123.yaml | 328 +++++++++++++ ...tv2_imagenet100_locked_breadth_seed42.yaml | 328 +++++++++++++ ...v2_imagenet100_locked_breadth_seed456.yaml | 
328 +++++++++++++ ...v2_imagenet100_locked_breadth_seed789.yaml | 328 +++++++++++++ ..._tinyimagenet_locked_breadth_seed1011.yaml | 332 +++++++++++++ ...2_tinyimagenet_locked_breadth_seed123.yaml | 332 +++++++++++++ ...v2_tinyimagenet_locked_breadth_seed42.yaml | 332 +++++++++++++ ...2_tinyimagenet_locked_breadth_seed456.yaml | 332 +++++++++++++ ...2_tinyimagenet_locked_breadth_seed789.yaml | 332 +++++++++++++ ...net18_cifar10_locked_breadth_seed1011.yaml | 435 ++++++++++++++++++ ...snet18_cifar10_locked_breadth_seed123.yaml | 435 ++++++++++++++++++ ...esnet18_cifar10_locked_breadth_seed42.yaml | 435 ++++++++++++++++++ ...snet18_cifar10_locked_breadth_seed456.yaml | 435 ++++++++++++++++++ ...snet18_cifar10_locked_breadth_seed789.yaml | 435 ++++++++++++++++++ ...8_imagenet100_locked_breadth_seed1011.yaml | 328 +++++++++++++ ...18_imagenet100_locked_breadth_seed123.yaml | 328 +++++++++++++ ...t18_imagenet100_locked_breadth_seed42.yaml | 328 +++++++++++++ ...18_imagenet100_locked_breadth_seed456.yaml | 328 +++++++++++++ ...18_imagenet100_locked_breadth_seed789.yaml | 328 +++++++++++++ ..._tinyimagenet_locked_breadth_seed1011.yaml | 332 +++++++++++++ ...8_tinyimagenet_locked_breadth_seed123.yaml | 332 +++++++++++++ ...18_tinyimagenet_locked_breadth_seed42.yaml | 332 +++++++++++++ ...8_tinyimagenet_locked_breadth_seed456.yaml | 332 +++++++++++++ ...8_tinyimagenet_locked_breadth_seed789.yaml | 332 +++++++++++++ ...0_imagenet100_locked_breadth_seed1011.yaml | 328 +++++++++++++ ...50_imagenet100_locked_breadth_seed123.yaml | 328 +++++++++++++ ...t50_imagenet100_locked_breadth_seed42.yaml | 328 +++++++++++++ ...50_imagenet100_locked_breadth_seed456.yaml | 328 +++++++++++++ ...50_imagenet100_locked_breadth_seed789.yaml | 328 +++++++++++++ .../vgg16_cifar10_locked_breadth_seed123.yaml | 430 +++++++++++++++++ .../vgg16_cifar10_locked_breadth_seed42.yaml | 430 +++++++++++++++++ .../vgg16_cifar10_locked_breadth_seed456.yaml | 430 +++++++++++++++++ 
...6_imagenet100_locked_breadth_seed1011.yaml | 332 +++++++++++++ ...16_imagenet100_locked_breadth_seed123.yaml | 332 +++++++++++++ ...g16_imagenet100_locked_breadth_seed42.yaml | 332 +++++++++++++ ...16_imagenet100_locked_breadth_seed456.yaml | 332 +++++++++++++ ...16_imagenet100_locked_breadth_seed789.yaml | 332 +++++++++++++ ..._tinyimagenet_locked_breadth_seed1011.yaml | 332 +++++++++++++ ...6_tinyimagenet_locked_breadth_seed123.yaml | 332 +++++++++++++ ...16_tinyimagenet_locked_breadth_seed42.yaml | 332 +++++++++++++ ...6_tinyimagenet_locked_breadth_seed456.yaml | 332 +++++++++++++ ...6_tinyimagenet_locked_breadth_seed789.yaml | 332 +++++++++++++ ...tv2_cifar10_locked_extension_seed1011.yaml | 324 +++++++++++++ ...etv2_cifar10_locked_extension_seed123.yaml | 324 +++++++++++++ ...netv2_cifar10_locked_extension_seed42.yaml | 324 +++++++++++++ ...etv2_cifar10_locked_extension_seed456.yaml | 324 +++++++++++++ ...etv2_cifar10_locked_extension_seed789.yaml | 324 +++++++++++++ ...t18_cifar10_locked_extension_seed1011.yaml | 434 +++++++++++++++++ ...et18_cifar10_locked_extension_seed123.yaml | 434 +++++++++++++++++ ...net18_cifar10_locked_extension_seed42.yaml | 434 +++++++++++++++++ ...et18_cifar10_locked_extension_seed456.yaml | 434 +++++++++++++++++ ...et18_cifar10_locked_extension_seed789.yaml | 434 +++++++++++++++++ ...imagenet100_locked_extension_seed1011.yaml | 327 +++++++++++++ ..._imagenet100_locked_extension_seed123.yaml | 327 +++++++++++++ ...0_imagenet100_locked_extension_seed42.yaml | 327 +++++++++++++ ..._imagenet100_locked_extension_seed456.yaml | 327 +++++++++++++ ..._imagenet100_locked_extension_seed789.yaml | 327 +++++++++++++ ...gg16_cifar10_locked_extension_seed123.yaml | 429 +++++++++++++++++ ...vgg16_cifar10_locked_extension_seed42.yaml | 429 +++++++++++++++++ ...gg16_cifar10_locked_extension_seed456.yaml | 429 +++++++++++++++++ ...ar100_locked_headline_hybrid_seed1011.yaml | 326 +++++++++++++ ...far100_locked_headline_hybrid_seed123.yaml | 326 
+++++++++++++ ...ifar100_locked_headline_hybrid_seed42.yaml | 326 +++++++++++++ ...far100_locked_headline_hybrid_seed456.yaml | 326 +++++++++++++ ...far100_locked_headline_hybrid_seed789.yaml | 326 +++++++++++++ ...ifar100_locked_headline_main_seed1011.yaml | 331 +++++++++++++ ...cifar100_locked_headline_main_seed123.yaml | 331 +++++++++++++ ..._cifar100_locked_headline_main_seed42.yaml | 331 +++++++++++++ ...cifar100_locked_headline_main_seed456.yaml | 331 +++++++++++++ ...cifar100_locked_headline_main_seed789.yaml | 331 +++++++++++++ ...ar100_locked_headline_hybrid_seed1011.yaml | 293 ++++++++++++ ...far100_locked_headline_hybrid_seed123.yaml | 293 ++++++++++++ ...ifar100_locked_headline_hybrid_seed42.yaml | 293 ++++++++++++ ...far100_locked_headline_hybrid_seed456.yaml | 293 ++++++++++++ ...far100_locked_headline_hybrid_seed789.yaml | 293 ++++++++++++ ...ifar100_locked_headline_main_seed1011.yaml | 298 ++++++++++++ ...cifar100_locked_headline_main_seed123.yaml | 298 ++++++++++++ ..._cifar100_locked_headline_main_seed42.yaml | 298 ++++++++++++ ...cifar100_locked_headline_main_seed456.yaml | 298 ++++++++++++ ...cifar100_locked_headline_main_seed789.yaml | 298 ++++++++++++ ...ar100_locked_headline_hybrid_seed1011.yaml | 293 ++++++++++++ ...far100_locked_headline_hybrid_seed123.yaml | 293 ++++++++++++ ...ifar100_locked_headline_hybrid_seed42.yaml | 293 ++++++++++++ ...far100_locked_headline_hybrid_seed456.yaml | 293 ++++++++++++ ...far100_locked_headline_hybrid_seed789.yaml | 293 ++++++++++++ ...ifar100_locked_headline_main_seed1011.yaml | 298 ++++++++++++ ...cifar100_locked_headline_main_seed123.yaml | 298 ++++++++++++ ..._cifar100_locked_headline_main_seed42.yaml | 298 ++++++++++++ ...cifar100_locked_headline_main_seed456.yaml | 298 ++++++++++++ ...cifar100_locked_headline_main_seed789.yaml | 298 ++++++++++++ ...cifar100_metric_usage_global_seed1011.yaml | 331 +++++++++++++ ..._cifar100_metric_usage_global_seed123.yaml | 331 +++++++++++++ 
...2_cifar100_metric_usage_global_seed42.yaml | 331 +++++++++++++ ..._cifar100_metric_usage_global_seed456.yaml | 331 +++++++++++++ ..._cifar100_metric_usage_global_seed789.yaml | 331 +++++++++++++ ...cifar100_metric_usage_hybrid_seed1011.yaml | 329 +++++++++++++ ..._cifar100_metric_usage_hybrid_seed123.yaml | 329 +++++++++++++ ...2_cifar100_metric_usage_hybrid_seed42.yaml | 329 +++++++++++++ ..._cifar100_metric_usage_hybrid_seed456.yaml | 329 +++++++++++++ ..._cifar100_metric_usage_hybrid_seed789.yaml | 329 +++++++++++++ ...ifar100_metric_usage_uniform_seed1011.yaml | 331 +++++++++++++ ...cifar100_metric_usage_uniform_seed123.yaml | 331 +++++++++++++ ..._cifar100_metric_usage_uniform_seed42.yaml | 331 +++++++++++++ ...cifar100_metric_usage_uniform_seed456.yaml | 331 +++++++++++++ ...cifar100_metric_usage_uniform_seed789.yaml | 331 +++++++++++++ ...cifar100_metric_usage_global_seed1011.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed123.yaml | 298 ++++++++++++ ...8_cifar100_metric_usage_global_seed42.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed456.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed789.yaml | 298 ++++++++++++ ...cifar100_metric_usage_hybrid_seed1011.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed123.yaml | 296 ++++++++++++ ...8_cifar100_metric_usage_hybrid_seed42.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed456.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed789.yaml | 296 ++++++++++++ ...ifar100_metric_usage_uniform_seed1011.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed123.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_uniform_seed42.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed456.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed789.yaml | 298 ++++++++++++ ...cifar100_metric_usage_global_seed1011.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed123.yaml | 298 ++++++++++++ 
...6_cifar100_metric_usage_global_seed42.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed456.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed789.yaml | 298 ++++++++++++ ...cifar100_metric_usage_hybrid_seed1011.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed123.yaml | 296 ++++++++++++ ...6_cifar100_metric_usage_hybrid_seed42.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed456.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed789.yaml | 296 ++++++++++++ ...ifar100_metric_usage_uniform_seed1011.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed123.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_uniform_seed42.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed456.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed789.yaml | 298 ++++++++++++ ...cifar100_metric_usage_global_seed1011.yaml | 331 +++++++++++++ ..._cifar100_metric_usage_global_seed123.yaml | 331 +++++++++++++ ...2_cifar100_metric_usage_global_seed42.yaml | 331 +++++++++++++ ..._cifar100_metric_usage_global_seed456.yaml | 331 +++++++++++++ ..._cifar100_metric_usage_global_seed789.yaml | 331 +++++++++++++ ...cifar100_metric_usage_hybrid_seed1011.yaml | 329 +++++++++++++ ..._cifar100_metric_usage_hybrid_seed123.yaml | 329 +++++++++++++ ...2_cifar100_metric_usage_hybrid_seed42.yaml | 329 +++++++++++++ ..._cifar100_metric_usage_hybrid_seed456.yaml | 329 +++++++++++++ ..._cifar100_metric_usage_hybrid_seed789.yaml | 329 +++++++++++++ ...ifar100_metric_usage_uniform_seed1011.yaml | 331 +++++++++++++ ...cifar100_metric_usage_uniform_seed123.yaml | 331 +++++++++++++ ..._cifar100_metric_usage_uniform_seed42.yaml | 331 +++++++++++++ ...cifar100_metric_usage_uniform_seed456.yaml | 331 +++++++++++++ ...cifar100_metric_usage_uniform_seed789.yaml | 331 +++++++++++++ ...cifar100_metric_usage_global_seed1011.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed123.yaml | 298 ++++++++++++ 
...8_cifar100_metric_usage_global_seed42.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed456.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed789.yaml | 298 ++++++++++++ ...cifar100_metric_usage_hybrid_seed1011.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed123.yaml | 296 ++++++++++++ ...8_cifar100_metric_usage_hybrid_seed42.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed456.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed789.yaml | 296 ++++++++++++ ...ifar100_metric_usage_uniform_seed1011.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed123.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_uniform_seed42.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed456.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed789.yaml | 298 ++++++++++++ ...cifar100_metric_usage_global_seed1011.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed123.yaml | 298 ++++++++++++ ...6_cifar100_metric_usage_global_seed42.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed456.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_global_seed789.yaml | 298 ++++++++++++ ...cifar100_metric_usage_hybrid_seed1011.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed123.yaml | 296 ++++++++++++ ...6_cifar100_metric_usage_hybrid_seed42.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed456.yaml | 296 ++++++++++++ ..._cifar100_metric_usage_hybrid_seed789.yaml | 296 ++++++++++++ ...ifar100_metric_usage_uniform_seed1011.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed123.yaml | 298 ++++++++++++ ..._cifar100_metric_usage_uniform_seed42.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed456.yaml | 298 ++++++++++++ ...cifar100_metric_usage_uniform_seed789.yaml | 298 ++++++++++++ ...cifar100_metric_weight_sweep_seed1011.yaml | 332 +++++++++++++ ..._cifar100_metric_weight_sweep_seed123.yaml | 332 +++++++++++++ ...2_cifar100_metric_weight_sweep_seed42.yaml | 
332 +++++++++++++ ..._cifar100_metric_weight_sweep_seed456.yaml | 332 +++++++++++++ ..._cifar100_metric_weight_sweep_seed789.yaml | 332 +++++++++++++ ...cifar100_metric_weight_sweep_seed1011.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed123.yaml | 299 ++++++++++++ ...8_cifar100_metric_weight_sweep_seed42.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed456.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed789.yaml | 299 ++++++++++++ ...cifar100_metric_weight_sweep_seed1011.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed123.yaml | 299 ++++++++++++ ...6_cifar100_metric_weight_sweep_seed42.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed456.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed789.yaml | 299 ++++++++++++ ...cifar100_metric_weight_sweep_seed1011.yaml | 332 +++++++++++++ ..._cifar100_metric_weight_sweep_seed123.yaml | 332 +++++++++++++ ...2_cifar100_metric_weight_sweep_seed42.yaml | 332 +++++++++++++ ..._cifar100_metric_weight_sweep_seed456.yaml | 332 +++++++++++++ ..._cifar100_metric_weight_sweep_seed789.yaml | 332 +++++++++++++ ...cifar100_metric_weight_sweep_seed1011.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed123.yaml | 299 ++++++++++++ ...8_cifar100_metric_weight_sweep_seed42.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed456.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed789.yaml | 299 ++++++++++++ ...cifar100_metric_weight_sweep_seed1011.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed123.yaml | 299 ++++++++++++ ...6_cifar100_metric_weight_sweep_seed42.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed456.yaml | 299 ++++++++++++ ..._cifar100_metric_weight_sweep_seed789.yaml | 299 ++++++++++++ ...resnet50_imagenet100_two_axis_seed123.yaml | 326 +++++++++++++ .../resnet50_imagenet100_two_axis_seed42.yaml | 326 +++++++++++++ ...mobilenetv2_cifar100_two_axis_seed123.yaml | 328 +++++++++++++ 
.../mobilenetv2_cifar100_two_axis_seed42.yaml | 328 +++++++++++++ ...mobilenetv2_cifar100_two_axis_seed456.yaml | 328 +++++++++++++ .../resnet18_cifar100_two_axis_seed123.yaml | 295 ++++++++++++ .../resnet18_cifar100_two_axis_seed42.yaml | 295 ++++++++++++ .../resnet18_cifar100_two_axis_seed456.yaml | 295 ++++++++++++ .../vgg16_cifar100_two_axis_seed123.yaml | 295 ++++++++++++ .../vgg16_cifar100_two_axis_seed42.yaml | 295 ++++++++++++ .../vgg16_cifar100_two_axis_seed456.yaml | 295 ++++++++++++ scripts/run_experiment.py | 77 +++- .../analysis/clustering/metric_clustering.py | 62 ++- .../analysis/visualization/cluster_plots.py | 7 +- src/alignment/configs/config_loader.py | 29 +- .../experiments/cluster_experiments.py | 90 +++- src/alignment/experiments/llm_experiments.py | 221 ++++++++- .../pruning/strategies/metric_based.py | 52 ++- 251 files changed, 77082 insertions(+), 98 deletions(-) create mode 100644 configs/prune_llm/llama3_70b_scale_benchmarks_50_papersafe.yaml create mode 100644 configs/prune_llm/llama3_70b_scale_mechanism.yaml create mode 100644 configs/prune_llm/llama3_70b_scale_pruning_curves.yaml create mode 100644 configs/prune_llm/llama3_70b_scale_sparsegpt_curves.yaml create mode 100644 configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked/convnext_tiny_imagenet100_smoke.yaml create mode 
100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed1011.yaml 
create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed123.yaml 
create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed789.yaml create mode 100644 
configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed1011.yaml create 
mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed42.yaml create mode 100644 
configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed42.yaml create mode 100644 
configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed42.yaml create mode 100644 
configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed42.yaml create mode 100644 
configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed456.yaml create mode 100644 
configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed456.yaml create mode 
100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed42.yaml create mode 100644 
configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed123.yaml create mode 100644 
configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed123.yaml create mode 100644 
configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed123.yaml create mode 100644 
configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed1011.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed789.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_scaleup/resnet50_imagenet100_two_axis_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_scaleup/resnet50_imagenet100_two_axis_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed456.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed456.yaml create mode 100644 
configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed123.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed42.yaml create mode 100644 configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed456.yaml diff --git a/configs/prune_llm/README.md b/configs/prune_llm/README.md index 39733d07..1845c9bc 100644 --- a/configs/prune_llm/README.md +++ b/configs/prune_llm/README.md @@ -13,14 +13,19 @@ Configurations for generating results in the SCAR LLM pruning paper. ## Quick Start -Run all experiments: +Run single model: ```bash -bash slurm_jobs/prune_llm/run_all_paper.sh +python scripts/run_experiment.py --config configs/prune_llm/llama3_8b_unified.yaml ``` -Run single model: +Paper batch launchers now live under: ```bash -python scripts/run_experiment.py --config configs/prune_llm/llama3_8b_unified.yaml +drafts/LLM_prune/paper/slurm_jobs/ +``` + +See: +```bash +drafts/LLM_prune/paper/slurm_jobs/README.md ``` Override base output directory: diff --git a/configs/prune_llm/llama3_70b_scale_benchmarks_50_papersafe.yaml b/configs/prune_llm/llama3_70b_scale_benchmarks_50_papersafe.yaml new file mode 100644 index 00000000..2c947dac --- /dev/null +++ b/configs/prune_llm/llama3_70b_scale_benchmarks_50_papersafe.yaml @@ -0,0 +1,146 @@ +# ============================================================================ +# LLAMA-3.1-70B PAPER-SAFE BENCHMARK RERUN: 50% FFN PRUNING + 8B-MATCHED TASKS +# ============================================================================ +# +# Purpose: +# - Upgrade the preliminary 70B benchmark table from 100 samples/task to a more +# defensible evaluation budget for main-paper use. +# - Match the 8B benchmark suite more closely by including ARC-E and OBQA. +# - Keep the method set focused on the main comparisons that matter in the paper. +# +# Recommended use: +# - Promote 70B task results into the main paper only after this rerun finishes. 
+# ============================================================================ + +experiment: + name: "llama3_70b_scale_benchmarks_50_papersafe" + type: "llm_alignment" + output_dir: "./results/paper/llama3_70b_scale_benchmarks_50_papersafe" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "meta-llama/Llama-3.1-70B" + dtype: "bfloat16" + device_map: "auto" + trust_remote_code: true + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "activation_l2_norm" + num_samples: 64 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: true + evaluation_num_samples: 500 + use_nvidia_fewshot: true + perplexity_protocol: "oats" + wikitext_subset: "wikitext-2-raw-v1" + perplexity_seq_len: 2048 + + evaluation_metrics: + - "perplexity" + - "accuracy_mmlu" + - "accuracy_hellaswag" + - "accuracy_piqa" + - "accuracy_boolq" + - "accuracy_winogrande" + - "accuracy_arc_easy" + - "accuracy_arc_challenge" + - "accuracy_openbookqa" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: true +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + - "supernode_protection_score" + - "supernode_connectivity_score" + 
cross_layer_analysis: false + compare_by_connection: true + compute_metrics: + - "activation" + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: false + outlier_analysis: false + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: true + target: "ffn" + structured: true + dependency_aware: true + distribution: "uniform" + min_per_layer: 0.0 + max_per_layer: 0.95 + + sparsity_levels: [0.5] + selection_modes: ["low"] + algorithms: + - "scar_loss_proxy" + - "supernode_protection_score" + - "supernode_connectivity_score" + - "activation_l2_norm" + - "weight_magnitude" + - "wanda" + +evaluation: + enabled: true diff --git a/configs/prune_llm/llama3_70b_scale_mechanism.yaml b/configs/prune_llm/llama3_70b_scale_mechanism.yaml new file mode 100644 index 00000000..c9fbefdb --- /dev/null +++ b/configs/prune_llm/llama3_70b_scale_mechanism.yaml @@ -0,0 +1,126 @@ +# ============================================================================ +# LLAMA-3.1-70B SCALE CHECK: MECHANISM / CONCENTRATION ONLY +# ============================================================================ +# +# Purpose: +# - Test whether the main supernode concentration phenomenon persists at 70B. +# - Keep the run limited to the reviewer-relevant scale question: +# - LP concentration / supernode identification +# - optional outlier summary +# - Skip pruning sweeps, downstream tasks, halo controls, and true ablation probes. +# +# This is designed as a rebuttal-focused replication, not a full paper rerun. 
+# ============================================================================ + +experiment: + name: "llama3_70b_scale_mechanism" + type: "llm_alignment" + output_dir: "./results/paper/llama3_70b_scale_mechanism" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "meta-llama/Llama-3.1-70B" + dtype: "bfloat16" + device_map: "auto" + trust_remote_code: true + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: false + + conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: 
+ enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/llama3_70b_scale_pruning_curves.yaml b/configs/prune_llm/llama3_70b_scale_pruning_curves.yaml new file mode 100644 index 00000000..75f7c40d --- /dev/null +++ b/configs/prune_llm/llama3_70b_scale_pruning_curves.yaml @@ -0,0 +1,137 @@ +# ============================================================================ +# LLAMA-3.1-70B SCALE CHECK: STRUCTURED PRUNING CURVES +# ============================================================================ +# +# Purpose: +# - Extend the 70B scale check beyond a single 50% point. +# - Produce pruning curves across a few key sparsity levels for the main methods: +# SCAR-LP, SCAR-Prot, SCAR-Conn, activation magnitude, weight magnitude, +# Wanda, and random. +# - Evaluate perplexity only to keep runtime tractable on 70B. +# +# This is intended to support paper-quality scale figures / appendix tables. 
+# ============================================================================ + +experiment: + name: "llama3_70b_scale_pruning_curves" + type: "llm_alignment" + output_dir: "./results/paper/llama3_70b_scale_pruning_curves" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "meta-llama/Llama-3.1-70B" + dtype: "bfloat16" + device_map: "auto" + trust_remote_code: true + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "activation_l2_norm" + num_samples: 64 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: true + evaluation_num_samples: 100 + evaluation_metrics: + - "perplexity" + perplexity_protocol: "oats" + wikitext_subset: "wikitext-2-raw-v1" + perplexity_seq_len: 2048 + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: true +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + - "supernode_protection_score" + - "supernode_connectivity_score" + cross_layer_analysis: false + compare_by_connection: true + compute_metrics: + - "activation" + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: false + outlier_analysis: false + +halo_analysis: + enabled: false + 
+cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: true + target: "ffn" + structured: true + dependency_aware: true + distribution: "uniform" + min_per_layer: 0.0 + max_per_layer: 0.95 + + sparsity_levels: [0.3, 0.5, 0.7] + selection_modes: ["low"] + algorithms: + - "scar_loss_proxy" + - "supernode_protection_score" + - "supernode_connectivity_score" + - "activation_l2_norm" + - "weight_magnitude" + - "wanda" + - "random" + +evaluation: + enabled: true diff --git a/configs/prune_llm/llama3_70b_scale_sparsegpt_curves.yaml b/configs/prune_llm/llama3_70b_scale_sparsegpt_curves.yaml new file mode 100644 index 00000000..9c708b31 --- /dev/null +++ b/configs/prune_llm/llama3_70b_scale_sparsegpt_curves.yaml @@ -0,0 +1,92 @@ +# ============================================================================ +# LLAMA-3.1-70B SCALE CHECK: SPARSEGPT CURVES +# ============================================================================ +# +# Purpose: +# - Complete the 70B scale-comparison figure/table for SparseGPT so the method +# is reported consistently across 30% / 50% / 70% FFN sparsity. 
+# - Keep the run narrow: +# - SparseGPT only +# - perplexity only +# - same calibration/eval budget as the other 70B scale checks +# ============================================================================ + +experiment: + name: "llama3_70b_scale_sparsegpt_curves" + type: "llm_alignment" + output_dir: "./results/paper/llama3_70b_scale_sparsegpt_curves" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "meta-llama/Llama-3.1-70B" + dtype: "bfloat16" + device_map: "auto" + trust_remote_code: true + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "activation_l2_norm" + num_samples: 64 + +do_scar_metrics: false + +llm: + evaluate_perplexity: true + evaluation_num_samples: 100 + evaluation_metrics: + - "perplexity" + perplexity_protocol: "oats" + wikitext_subset: "wikitext-2-raw-v1" + perplexity_seq_len: 2048 + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +pruning: + enabled: true + target: "ffn" + structured: true + dependency_aware: true + distribution: "uniform" + min_per_layer: 0.0 + max_per_layer: 0.95 + + sparsity_levels: [0.3, 0.5, 0.7] + selection_modes: ["low"] + algorithms: + - "sparsegpt" + + single_strategy: "sparsegpt" + +evaluation: + enabled: true diff --git a/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed123.yaml b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed123.yaml new file mode 100644 index 
00000000..1575d971 --- /dev/null +++ b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed123.yaml @@ -0,0 +1,249 @@ +{ + "name": "convnext_tiny_imagenet100_locked_pilot_seed123", + "description": "ConvNeXt-T ImageNet-100 pilot for NeurIPS 2026 (locked pruning subset)", + "tags": ["paper2026", "convnext", "imagenet100", "pilot_pruning", "seed123"], + "experiment_type": "cluster_analysis", + "model_name": "torchvision_model", + "model_config": { + "model_name": "convnext_tiny" + }, + "pretrained": true, + "model_checkpoint": null, + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": true, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": true, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + 
"cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": ["all", "rq_red", "rq_syn", "red_syn"], + "run_permutation_baseline": false, + "n_permutations": 50, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + 
"outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [0.0, 0.1, 0.3, 0.5, 0.7, 0.9], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red" + ], + "pruning_amounts": [0.1, 0.3, 0.5, 0.7, 0.8, 0.9], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": false + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/convnext_tiny_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/convnext_tiny_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/convnext_pilot_pruning_20260409", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": ["perplexity"], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": false, + "do_connectivity_pruning": false, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": false, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": false + }, + "generalized_importance": {}, + "do_halo_analysis": false, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed42.yaml b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed42.yaml new file mode 100644 index 00000000..0b4bc94f --- /dev/null +++ b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed42.yaml @@ -0,0 +1,249 @@ +{ + "name": "convnext_tiny_imagenet100_locked_pilot_seed42", + "description": "ConvNeXt-T ImageNet-100 pilot for NeurIPS 2026 
(locked pruning subset)", + "tags": ["paper2026", "convnext", "imagenet100", "pilot_pruning", "seed42"], + "experiment_type": "cluster_analysis", + "model_name": "torchvision_model", + "model_config": { + "model_name": "convnext_tiny" + }, + "pretrained": true, + "model_checkpoint": null, + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": true, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": true, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + 
"synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": ["all", "rq_red", "rq_syn", "red_syn"], + "run_permutation_baseline": false, + "n_permutations": 50, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + 
"geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [0.0, 0.1, 0.3, 0.5, 0.7, 0.9], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red" + ], + "pruning_amounts": [0.1, 0.3, 0.5, 0.7, 0.8, 0.9], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": false + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/convnext_tiny_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/convnext_tiny_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/convnext_pilot_pruning_20260409", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + 
"evaluation_metrics": ["perplexity"], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": false, + "do_connectivity_pruning": false, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": false, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": false + }, + "generalized_importance": {}, + "do_halo_analysis": false, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed456.yaml b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed456.yaml new file mode 100644 index 00000000..31a5e2d0 --- /dev/null +++ b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_locked_pilot_seed456.yaml @@ -0,0 +1,249 @@ +{ + "name": "convnext_tiny_imagenet100_locked_pilot_seed456", + "description": "ConvNeXt-T ImageNet-100 pilot for NeurIPS 2026 (locked pruning subset)", + "tags": ["paper2026", "convnext", "imagenet100", "pilot_pruning", "seed456"], + "experiment_type": "cluster_analysis", + "model_name": "torchvision_model", + "model_config": { + "model_name": "convnext_tiny" + }, + "pretrained": true, + "model_checkpoint": null, + "dataset_name": 
"imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": true, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": true, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + 
"exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": ["all", "rq_red", "rq_syn", "red_syn"], + "run_permutation_baseline": false, + "n_permutations": 50, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [0.0, 0.1, 0.3, 0.5, 0.7, 0.9], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + 
"distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red" + ], + "pruning_amounts": [0.1, 0.3, 0.5, 0.7, 0.8, 0.9], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": false + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/convnext_tiny_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/convnext_tiny_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/convnext_pilot_pruning_20260409", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": ["perplexity"], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": false, + "do_connectivity_pruning": false, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + 
"supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": false, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": false + }, + "generalized_importance": {}, + "do_halo_analysis": false, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed123.yaml b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed123.yaml new file mode 100644 index 00000000..c1c1ca2c --- /dev/null +++ b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed123.yaml @@ -0,0 +1,213 @@ +{ + "name": "convnext_tiny_imagenet100_struct_seed123", + "description": "ConvNeXt-T ImageNet-100 pilot for NeurIPS 2026 (structural consistency)", + "tags": ["paper2026", "convnext", "imagenet100", "pilot_structural", "seed123"], + "experiment_type": "cluster_analysis", + "model_name": "torchvision_model", + "model_config": { + "model_name": "convnext_tiny" + }, + "pretrained": true, + "model_checkpoint": null, + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": true, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + 
"num_networks": 1, + "do_train": true, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + 
"alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": ["all", "rq_red", "rq_syn", "red_syn"], + "run_permutation_baseline": false, + "n_permutations": 50, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": false + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/convnext_tiny_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/convnext_tiny_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/convnext_pilot_structural_20260409", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": ["perplexity"], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": false, + "do_connectivity_pruning": false, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": false, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": false + }, + "generalized_importance": {}, + "do_halo_analysis": false, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed42.yaml b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed42.yaml new file mode 100644 index 00000000..679aeda7 --- /dev/null +++ b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed42.yaml @@ -0,0 +1,213 @@ +{ + "name": "convnext_tiny_imagenet100_struct_seed42", + "description": "ConvNeXt-T ImageNet-100 pilot for NeurIPS 2026 (structural consistency)", + "tags": ["paper2026", "convnext", "imagenet100", "pilot_structural", "seed42"], + "experiment_type": "cluster_analysis", + "model_name": "torchvision_model", + 
"model_config": { + "model_name": "convnext_tiny" + }, + "pretrained": true, + "model_checkpoint": null, + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": true, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": true, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": ["all", "rq_red", "rq_syn", "red_syn"], + "run_permutation_baseline": false, + "n_permutations": 50, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": false, + 
"generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": false + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/convnext_tiny_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/convnext_tiny_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/convnext_pilot_structural_20260409", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": ["perplexity"], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": false, + "do_connectivity_pruning": false, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": false, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": false + }, + "generalized_importance": {}, + "do_halo_analysis": false, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed456.yaml b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed456.yaml new file mode 100644 index 00000000..2cb01850 --- /dev/null 
+++ b/configs/vision_prune/paper_2026_convnext_pilot/convnext_tiny_imagenet100_struct_seed456.yaml @@ -0,0 +1,213 @@ +{ + "name": "convnext_tiny_imagenet100_struct_seed456", + "description": "ConvNeXt-T ImageNet-100 pilot for NeurIPS 2026 (structural consistency)", + "tags": ["paper2026", "convnext", "imagenet100", "pilot_structural", "seed456"], + "experiment_type": "cluster_analysis", + "model_name": "torchvision_model", + "model_config": { + "model_name": "convnext_tiny" + }, + "pretrained": true, + "model_checkpoint": null, + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": true, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": true, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": 
"gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": ["all", "rq_red", "rq_syn", "red_syn"], + "run_permutation_baseline": false, + "n_permutations": 50, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + 
"bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": false + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/convnext_tiny_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/convnext_tiny_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/convnext_pilot_structural_20260409", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": ["perplexity"], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": false, + "do_connectivity_pruning": false, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": false, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": false + }, + "generalized_importance": {}, + "do_halo_analysis": false, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_locked/convnext_tiny_imagenet100_smoke.yaml b/configs/vision_prune/paper_2026_locked/convnext_tiny_imagenet100_smoke.yaml new file mode 100644 index 00000000..ca1ff6ea --- /dev/null +++ b/configs/vision_prune/paper_2026_locked/convnext_tiny_imagenet100_smoke.yaml @@ -0,0 +1,204 @@ +{ + "name": "convnext_tiny_imagenet100_smoke", + "description": "Low-cost compatibility pilot for a modern CNN backbone in the NeurIPS 2026 paper pipeline using the pretrained ImageNet-1K head.", + "tags": ["paper_2026", "convnext", "smoke_test"], + "experiment_type": "cluster_analysis", + "model_name": "torchvision_model", + "model_config": { + "model_name": "convnext_tiny" + }, + "pretrained": true, + "model_checkpoint": null, + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 16, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": true, + "training_epochs": 1, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": true, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": 
true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 5, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": false, + "measure_alignment_during_training": false, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 64, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 8, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 20, + "synergy_pairs": 5, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": ["all", "rq_red", "rq_syn", "red_syn"], + "run_permutation_baseline": false, + "n_permutations": 20, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 64, + 
"compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 128, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 64, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": false, + "generate_plots": false, + "plot_format": "pdf", + "plot_dpi": 300, + "visualization_options": { + "enabled": false + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/convnext_tiny_imagenet100_smoke/checkpoints", + "checkpoint_interval": 1000, + "save_best": false, + "log_dir": "./results/vision/convnext_tiny_imagenet100_smoke", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": ["perplexity"], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": false, + "do_connectivity_pruning": false, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": false, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": false + }, + "generalized_importance": {}, + "do_halo_analysis": false, + "do_generalized_importance": false, + "do_scar_optimal": 
false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": 2, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {} +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..73361693 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed1011.yaml @@ -0,0 +1,305 @@ +{ + "name": "alexnet_imagenet100_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "alexnet", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "alexnet", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/alexnet_imagenet100_cluster_analysis_s50refine_20260306_073243_64241303/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + 
"cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, 
+ "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.5, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 150, + "visualization_options": { + "enabled": true, + "save_format": "png", + "dpi": 150, + "generate": [ + "metric_distributions", + "cluster_scatter", + "cluster_evolution", + "halo_influence_matrix", + "pruning_curves", + "cascade_damage" + ] + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/alexnet_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/alexnet_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "threshold_percentile": 90, + "influence_type": "activation_weighted", + "skip_residual_edges": false + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed123.yaml new file mode 100644 index 00000000..8c703155 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed123.yaml @@ -0,0 +1,305 @@ +{ + "name": "alexnet_imagenet100_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "alexnet", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "alexnet", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/alexnet_imagenet100_cluster_analysis_s50refine_20260306_064154_64241300/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + 
"taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + 
"cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.5, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 150, + "visualization_options": { + "enabled": true, + "save_format": "png", + "dpi": 150, + "generate": [ + "metric_distributions", + "cluster_scatter", + "cluster_evolution", + "halo_influence_matrix", + "pruning_curves", + "cascade_damage" + ] + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/alexnet_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/alexnet_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": 
true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "threshold_percentile": 90, + "influence_type": "activation_weighted", + "skip_residual_edges": false + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed42.yaml new file mode 100644 index 00000000..3e2b6de2 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed42.yaml @@ -0,0 +1,305 @@ +{ + "name": "alexnet_imagenet100_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "alexnet", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "alexnet", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/alexnet_imagenet100_cluster_analysis_s50refine_20260306_063506_64241299/checkpoints/trained_model.pth", + "dataset_name": 
"imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + 
"measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + 
"generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.5, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + 
"cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 150, + "visualization_options": { + "enabled": true, + "save_format": "png", + "dpi": 150, + "generate": [ + "metric_distributions", + "cluster_scatter", + "cluster_evolution", + "halo_influence_matrix", + "pruning_curves", + "cascade_damage" + ] + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/alexnet_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/alexnet_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "threshold_percentile": 90, + "influence_type": "activation_weighted", + "skip_residual_edges": false + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + 
"use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed456.yaml new file mode 100644 index 00000000..9f444c70 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed456.yaml @@ -0,0 +1,305 @@ +{ + "name": "alexnet_imagenet100_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "alexnet", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "alexnet", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/alexnet_imagenet100_cluster_analysis_s50refine_20260306_070452_64241301/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + 
"type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.5, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 150, + "visualization_options": { + "enabled": true, + "save_format": "png", + "dpi": 150, + "generate": [ + "metric_distributions", + "cluster_scatter", + "cluster_evolution", + "halo_influence_matrix", + "pruning_curves", + "cascade_damage" + ] + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/alexnet_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/alexnet_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "threshold_percentile": 90, + "influence_type": "activation_weighted", + "skip_residual_edges": false + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed789.yaml new file mode 100644 index 00000000..4dbcf9f3 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_locked_benchmark_breadth/alexnet_imagenet100_locked_breadth_seed789.yaml @@ -0,0 +1,305 @@ +{ + "name": "alexnet_imagenet100_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "alexnet", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "alexnet", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/alexnet_imagenet100_cluster_analysis_s50refine_20260306_072619_64241302/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + 
"taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + 
"cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.5, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 150, + "visualization_options": { + "enabled": true, + "save_format": "png", + "dpi": 150, + "generate": [ + "metric_distributions", + "cluster_scatter", + "cluster_evolution", + "halo_influence_matrix", + "pruning_curves", + "cascade_damage" + ] + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/alexnet_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/alexnet_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": 
true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "threshold_percentile": 90, + "influence_type": "activation_weighted", + "skip_residual_edges": false + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..9638f661 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed1011.yaml @@ -0,0 +1,325 @@ +{ + "name": "mobilenetv2_cifar10_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260207_193655_59409622/checkpoints/trained_model.pth", + "dataset_name": 
"cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + 
"exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + 
"pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": 
{}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed123.yaml new file mode 100644 index 00000000..db4b3718 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed123.yaml @@ -0,0 +1,325 @@ +{ + "name": "mobilenetv2_cifar10_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260126_123830_57082561/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 50, + 
"learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + 
"save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + 
"cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + 
"fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + 
"generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed42.yaml new file mode 100644 index 00000000..591f0c2c --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed42.yaml @@ -0,0 +1,325 @@ +{ + "name": "mobilenetv2_cifar10_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260126_123831_57082560/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + 
"rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + 
"alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + 
"generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + 
"dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + 
"do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed456.yaml new file mode 100644 index 00000000..75002bf0 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed456.yaml @@ -0,0 +1,325 @@ +{ + "name": "mobilenetv2_cifar10_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260126_123831_57082559/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + 
"calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + 
"generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + 
"plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + 
"tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed789.yaml new file mode 100644 index 00000000..3046ed6b --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_cifar10_locked_breadth_seed789.yaml @@ -0,0 +1,325 @@ +{ + "name": "mobilenetv2_cifar10_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260207_193700_59409621/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + 
"gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": 
"pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + 
"cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + 
"pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..07993035 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed1011.yaml @@ -0,0 +1,328 @@ +{ + "name": "mobilenetv2_imagenet100_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_imagenet100_cluster_analysis_s50refine_20260306_054545_64241293/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + 
"sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, 
+ "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + 
"generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": 
true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed123.yaml new file mode 100644 index 00000000..b44f66e9 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed123.yaml @@ -0,0 +1,328 @@ +{ + "name": "mobilenetv2_imagenet100_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_imagenet100_cluster_analysis_s50refine_20260306_045426_64241290/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + 
"metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed42.yaml new file mode 100644 index 00000000..98fa45cd --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed42.yaml @@ -0,0 +1,328 @@ +{ + "name": "mobilenetv2_imagenet100_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_imagenet100_cluster_analysis_s50refine_20260306_043742_64241289/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + 
"metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed456.yaml new file mode 100644 index 00000000..066f6d66 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed456.yaml @@ -0,0 +1,328 @@ +{ + "name": "mobilenetv2_imagenet100_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_imagenet100_cluster_analysis_s50refine_20260306_045917_64241291/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + 
"metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed789.yaml new file mode 100644 index 00000000..dfce3e68 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_imagenet100_locked_breadth_seed789.yaml @@ -0,0 +1,328 @@ +{ + "name": "mobilenetv2_imagenet100_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_imagenet100_cluster_analysis_s50refine_20260306_052432_64241292/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + 
"metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..a90560a7 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed1011.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_tinyimagenet_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenetv2", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_tinyimagenet_cluster_analysis_seed1011_20260218_073158_60879287/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + 
"gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": 
"pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": 
true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + 
"fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed123.yaml new file mode 100644 index 00000000..9b2f3bd5 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed123.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_tinyimagenet_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenetv2", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_tinyimagenet_cluster_analysis_seed123_20260218_052659_60939090/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 
0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + 
"enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + 
"use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed42.yaml new file mode 100644 index 00000000..e7550525 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed42.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_tinyimagenet_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenetv2", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_tinyimagenet_cluster_analysis_seed42_20260218_045227_60937327/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + 
"synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + 
"generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + 
"pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + 
"do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed456.yaml new file mode 100644 index 00000000..94f914f9 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed456.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_tinyimagenet_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenetv2", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_tinyimagenet_cluster_analysis_seed456_20260218_062612_60942010/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 
0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + 
"alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 
1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + 
"alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + 
"do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed789.yaml new file mode 100644 index 00000000..0ad80696 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/mobilenetv2_tinyimagenet_locked_breadth_seed789.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_tinyimagenet_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenetv2", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_tinyimagenet_cluster_analysis_seed789_20260218_070006_60944058/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + 
"training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + 
"compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + 
"cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + 
"pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + 
"enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..e317e180 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed1011.yaml @@ -0,0 +1,435 @@ +{ + "name": "resnet18_cifar10_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260206_162747_59203880/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + 
"weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + 
"alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 
1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + 
"alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ 
+ "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed123.yaml new file mode 100644 index 00000000..96b33bfd --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed123.yaml @@ -0,0 +1,435 @@ +{ + "name": "resnet18_cifar10_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260126_123831_57082554/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + 
"activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + 
"correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + 
"evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed42.yaml 
b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed42.yaml new file mode 100644 index 00000000..15cc3b96 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed42.yaml @@ -0,0 +1,435 @@ +{ + "name": "resnet18_cifar10_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260126_123830_57082553/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + 
"activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + 
"run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + 
"generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", 
+ "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + 
"supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed456.yaml new file mode 100644 index 00000000..ed4a0013 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed456.yaml @@ -0,0 +1,435 @@ +{ + "name": "resnet18_cifar10_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + 
"locked_benchmark_breadth", + "cifar10", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260126_123832_57082549/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + 
"rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + 
"taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + 
"cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + 
"synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + 
"supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed789.yaml new file mode 100644 index 00000000..f7029f05 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_cifar10_locked_breadth_seed789.yaml @@ -0,0 +1,435 @@ +{ + "name": "resnet18_cifar10_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260206_162741_59203877/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + 
"num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + 
"gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + 
"cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + 
"pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + 
"fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": 
{ + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..665ceade --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed1011.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet18_imagenet100_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_imagenet100_cluster_analysis_s50refine_20260306_035935_64241280/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + 
"synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + 
"generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + 
"pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + 
"do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed123.yaml new file mode 100644 index 00000000..79f78ebb --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed123.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet18_imagenet100_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_imagenet100_cluster_analysis_s50refine_20260306_023255_64241274/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + 
"rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + 
"cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + 
"generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, 
+ "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": 
null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed42.yaml new file mode 100644 index 00000000..b2a3a056 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed42.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet18_imagenet100_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_imagenet100_cluster_analysis_s50refine_20260306_023248_64241272/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 
0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + 
"enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + 
"use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed456.yaml new file mode 100644 index 00000000..9f56d197 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed456.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet18_imagenet100_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_imagenet100_cluster_analysis_s50refine_20260306_023642_64241276/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + 
"definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + 
"simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 
0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + 
"histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + 
"hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed789.yaml new file mode 100644 index 00000000..3aca8802 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_imagenet100_locked_breadth_seed789.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet18_imagenet100_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_imagenet100_cluster_analysis_s50refine_20260306_025815_64241278/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": 
"flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": 
true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": 
false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..f87609ee --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed1011.yaml @@ -0,0 +1,332 @@ +{ + "name": "resnet18_tinyimagenet_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_tinyimagenet_cluster_analysis_seed1011_20260218_010016_60919345/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + 
"simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + 
"generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + 
"style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + 
"model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed123.yaml new file mode 100644 index 00000000..0b2b42a1 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed123.yaml @@ -0,0 +1,332 @@ +{ + "name": "resnet18_tinyimagenet_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_tinyimagenet_cluster_analysis_seed123_20260217_223946_60879289/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + 
"calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + 
"generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + 
"plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + 
"use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed42.yaml new file mode 100644 index 00000000..4ef9b051 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed42.yaml @@ -0,0 +1,332 @@ +{ + "name": "resnet18_tinyimagenet_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_tinyimagenet_cluster_analysis_seed42_20260217_223944_60879288/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + 
"synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + 
"generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + 
"pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + 
"do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed456.yaml new file mode 100644 index 00000000..7b628be3 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed456.yaml @@ -0,0 +1,332 @@ +{ + "name": "resnet18_tinyimagenet_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_tinyimagenet_cluster_analysis_seed456_20260217_235456_60903461/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 
0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + 
"alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + 
"generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + 
"alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + 
"do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed789.yaml new file mode 100644 index 00000000..5069bca7 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet18_tinyimagenet_locked_breadth_seed789.yaml @@ -0,0 +1,332 @@ +{ + "name": "resnet18_tinyimagenet_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_tinyimagenet_cluster_analysis_seed789_20260218_000359_60904287/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 50, + 
"learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + 
"save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + 
"cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + 
"fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + 
"use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..ea097fda --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed1011.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet50_imagenet100_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet50", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_030010_64241279/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, 
+ "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, 
+ "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 
1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + 
"alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + 
"do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed123.yaml new file mode 100644 index 00000000..27354ffb --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed123.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet50_imagenet100_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet50", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_023258_64241273/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + 
"rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + 
"alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + 
"generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": 
"forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + 
"do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed42.yaml new file mode 100644 index 00000000..4633c6aa --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed42.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet50_imagenet100_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet50", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_023248_64241271/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + 
"synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + 
"generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + 
"pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + 
"do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed456.yaml new file mode 100644 index 00000000..e7c7d440 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed456.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet50_imagenet100_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet50", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_023256_64241275/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + 
"rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + 
"cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + 
"generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, 
+ "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": 
null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed789.yaml new file mode 100644 index 00000000..edb4caf2 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/resnet50_imagenet100_locked_breadth_seed789.yaml @@ -0,0 +1,328 @@ +{ + "name": "resnet50_imagenet100_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "resnet50", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_025559_64241277/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 
0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + 
"enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + 
"use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed123.yaml new file mode 100644 index 00000000..80ec125d --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed123.yaml @@ -0,0 +1,430 @@ +{ + "name": "vgg16_cifar10_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar10_cluster_analysis_20260126_123830_57082556/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": 
{ + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + 
"activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + 
"correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, 
+ "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed42.yaml new file mode 100644 index 00000000..2fe3f782 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed42.yaml @@ -0,0 +1,430 @@ +{ + "name": "vgg16_cifar10_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar10_cluster_analysis_20260126_123831_57082555/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + 
"within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + 
"do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + 
"layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + 
"generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed456.yaml new file mode 100644 index 00000000..32f8f59c --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_cifar10_locked_breadth_seed456.yaml @@ -0,0 +1,430 @@ +{ + "name": "vgg16_cifar10_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "cifar10", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar10_cluster_analysis_20260126_123830_57082552/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + 
"cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + 
"cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + 
"pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + 
"enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + 
"rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..6ce91e28 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed1011.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_imagenet100_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_imagenet100_cluster_analysis_s50refine_20260306_043333_64241285/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": 
false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + 
"alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "rq", + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + 
"cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + 
"fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + 
"compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed123.yaml new file mode 100644 index 00000000..22ea1ea5 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed123.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_imagenet100_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_imagenet100_cluster_analysis_s50refine_20260306_040222_64241282/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + 
"num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + 
"alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "rq", + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 
0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + 
"fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + 
"compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed42.yaml new file mode 100644 index 00000000..a8ac094a --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed42.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_imagenet100_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_imagenet100_cluster_analysis_s50refine_20260306_040107_64241281/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + 
"num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + 
"alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "rq", + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 
0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + 
"fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + 
"compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed456.yaml new file mode 100644 index 00000000..c8318a12 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed456.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_imagenet100_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_imagenet100_cluster_analysis_s50refine_20260306_042343_64241283/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + 
"num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + 
"alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "rq", + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 
0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + 
"fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + 
"compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed789.yaml new file mode 100644 index 00000000..fdd532df --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_imagenet100_locked_breadth_seed789.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_imagenet100_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "imagenet100", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_imagenet100_cluster_analysis_s50refine_20260306_043236_64241284/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 20, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + 
"num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + 
"alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "clustering_first_metric": "rq", + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "taylor_act_samples": 1024, + "taylor_act_batch_size": 16, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "chip_images": 256, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 
0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + 
"fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + 
"compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed1011.yaml new file mode 100644 index 00000000..74d90a76 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed1011.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_tinyimagenet_locked_breadth_seed1011", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_tinyimagenet_cluster_analysis_seed1011_20260218_040351_60935233/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 1011, + 
"train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + 
"cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + 
"pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + 
"halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed123.yaml new file mode 100644 index 00000000..92b4677f --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed123.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_tinyimagenet_locked_breadth_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_tinyimagenet_cluster_analysis_seed123_20260218_020537_60926060/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": 
"mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + 
"pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + 
"scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed42.yaml new file mode 100644 index 00000000..e5486a36 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed42.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_tinyimagenet_locked_breadth_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_tinyimagenet_cluster_analysis_seed42_20260218_011136_60921155/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + 
"cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": 
"low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + 
"do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed456.yaml new file mode 100644 index 00000000..d1a05932 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed456.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_tinyimagenet_locked_breadth_seed456", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16", + "model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_tinyimagenet_cluster_analysis_seed456_20260218_024046_60929405/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 
0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 
1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + 
"cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + 
"do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed789.yaml new file mode 100644 index 00000000..196f8ec1 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_breadth/vgg16_tinyimagenet_locked_breadth_seed789.yaml @@ -0,0 +1,332 @@ +{ + "name": "vgg16_tinyimagenet_locked_breadth_seed789", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "locked_benchmark_breadth", + "tinyimagenet", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16", + 
"model_config": { + "num_classes": 200 + }, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_tinyimagenet_cluster_analysis_seed789_20260218_032859_60932756/checkpoints/trained_model.pth", + "dataset_name": "tinyimagenet", + "dataset_config": { + "root": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200" + }, + "data_path": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/DATA/tiny-imagenet-200", + "batch_size": 128, + "num_workers": 8, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + 
"use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + 
"pruning_strategies": [ + "taylor", + "chip", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_tinyimagenet/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_tinyimagenet", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_breadth_20260407", + "wandb_project": null, + "wandb_entity": 
null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed1011.yaml new file mode 100644 index 00000000..b217b478 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed1011.yaml @@ -0,0 +1,324 @@ +{ + "name": "mobilenetv2_cifar10_locked_extension_seed1011", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "mobilenetv2", 
+ "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260207_193655_59409622/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, 
+ "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + 
"geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + 
"composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": 
{}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed123.yaml new file mode 100644 index 00000000..3550ea26 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed123.yaml @@ -0,0 +1,324 @@ +{ + "name": "mobilenetv2_cifar10_locked_extension_seed123", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260126_123830_57082561/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + 
"cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + 
"cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + 
"pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + 
"do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed42.yaml new file mode 100644 index 00000000..1eaa8a30 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed42.yaml @@ -0,0 +1,324 @@ +{ + "name": "mobilenetv2_cifar10_locked_extension_seed42", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260126_123831_57082560/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + 
"dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": 
true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + 
"cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": 
"layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + 
"supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed456.yaml new file mode 100644 index 00000000..ee52210f --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed456.yaml @@ -0,0 +1,324 @@ +{ + "name": "mobilenetv2_cifar10_locked_extension_seed456", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260126_123831_57082559/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.01, + 
"optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + 
"measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + 
"cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, 
+ "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed789.yaml new file mode 100644 index 00000000..7ae16db3 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/mobilenetv2_cifar10_locked_extension_seed789.yaml @@ -0,0 +1,324 @@ +{ + "name": "mobilenetv2_cifar10_locked_extension_seed789", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar10_cluster_analysis_20260207_193700_59409621/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", 
+ "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + 
"gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 
1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": 
true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed1011.yaml new file mode 100644 index 00000000..88c880d8 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed1011.yaml @@ -0,0 +1,434 @@ +{ + "name": "resnet18_cifar10_locked_extension_seed1011", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260206_162747_59203880/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + 
"simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + 
"generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, 
+ "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + 
"do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed123.yaml 
b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed123.yaml new file mode 100644 index 00000000..3760b5db --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed123.yaml @@ -0,0 +1,434 @@ +{ + "name": "resnet18_cifar10_locked_extension_seed123", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260126_123831_57082554/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + 
"activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + 
"run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + 
"generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + 
"network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + 
"supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed42.yaml new file mode 100644 index 00000000..62f3fbb7 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed42.yaml @@ -0,0 +1,434 @@ +{ + "name": "resnet18_cifar10_locked_extension_seed42", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + 
"paper2026", + "locked_benchmark_extension", + "cifar10", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260126_123830_57082553/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + 
"rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + 
"taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + 
"cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + 
"synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + 
"supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed456.yaml new file mode 100644 index 00000000..b2edb903 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed456.yaml @@ -0,0 +1,434 @@ +{ + "name": "resnet18_cifar10_locked_extension_seed456", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260126_123832_57082549/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 
128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + 
"gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + 
"cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + 
"pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + 
"fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + 
"plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed789.yaml new file mode 100644 index 00000000..016026b8 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet18_cifar10_locked_extension_seed789.yaml @@ -0,0 +1,434 @@ +{ + "name": "resnet18_cifar10_locked_extension_seed789", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar10_cluster_analysis_20260206_162741_59203877/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + 
], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + 
"supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + 
"generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": 
false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": true, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + 
"fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed1011.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed1011.yaml new file mode 100644 index 00000000..b6670689 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed1011.yaml @@ -0,0 +1,327 @@ +{ + "name": "resnet50_imagenet100_locked_extension_seed1011", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "imagenet100", + "resnet50", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_030010_64241279/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + 
"task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + 
"generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": 
true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + 
"pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed123.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed123.yaml new file mode 100644 index 00000000..acb662c0 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed123.yaml @@ -0,0 +1,327 @@ +{ + "name": "resnet50_imagenet100_locked_extension_seed123", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "imagenet100", + "resnet50", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_023258_64241273/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + 
"synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": 
"logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + 
"generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + 
"pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed42.yaml new file mode 100644 index 00000000..629fb7c9 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed42.yaml @@ -0,0 +1,327 @@ +{ + "name": "resnet50_imagenet100_locked_extension_seed42", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "imagenet100", + "resnet50", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_023248_64241271/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": 
false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + 
"type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": 
true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed456.yaml new file mode 100644 index 00000000..4dbe273d --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed456.yaml @@ -0,0 +1,327 @@ +{ + "name": "resnet50_imagenet100_locked_extension_seed456", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "imagenet100", + "resnet50", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_023256_64241275/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": 
true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed789.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed789.yaml new file mode 100644 index 00000000..4bde67d4 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/resnet50_imagenet100_locked_extension_seed789.yaml @@ -0,0 +1,327 @@ +{ + "name": "resnet50_imagenet100_locked_extension_seed789", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "imagenet100", + "resnet50", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_025559_64241277/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": 
true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed123.yaml 
b/configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed123.yaml new file mode 100644 index 00000000..1396b9b3 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed123.yaml @@ -0,0 +1,429 @@ +{ + "name": "vgg16_cifar10_locked_extension_seed123", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar10_cluster_analysis_20260126_123830_57082556/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + 
"activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + 
"run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + 
"generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + 
"network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + 
"supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed42.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed42.yaml new file mode 100644 index 00000000..c15affdf --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed42.yaml @@ -0,0 +1,429 @@ +{ + "name": "vgg16_cifar10_locked_extension_seed42", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "vgg16", + "seed42" + ], + 
"experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar10_cluster_analysis_20260126_123831_57082555/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + 
} + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + 
"hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + 
"magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + 
"magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + 
"model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed456.yaml b/configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed456.yaml new file mode 100644 index 00000000..663a2617 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_benchmark_extension/vgg16_cifar10_locked_extension_seed456.yaml @@ -0,0 +1,429 @@ +{ + "name": "vgg16_cifar10_locked_extension_seed456", + "description": "Locked benchmark extension for NeurIPS two-axis consistency checks", + "tags": [ + "paper2026", + "locked_benchmark_extension", + "cifar10", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar10_cluster_analysis_20260126_123830_57082552/checkpoints/trained_model.pth", + "dataset_name": "cifar10", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 50, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + 
"scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + 
"alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + 
"generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid", + "composite_pid_no_red", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 
0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": { + "enabled": true, + "accuracy_vs_sparsity": true, + "accuracy_vs_flops": true, + "accuracy_vs_params": true, + "methods_to_compare": [ + "random", + "magnitude", + "taylor", + "composite", + "cluster_aware", + "network_slimming" + ] + }, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": { + "enabled": true, + "by_layer": true, + "by_cluster": true + }, + "layer_importance_heatmap": true, + "sensitivity_curves": true, + "efficiency_tradeoffs": { + "enabled": true, + "accuracy_vs_flops": true, + "accuracy_vs_latency": true, + "accuracy_vs_params": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "redundancy", + "synergy" + ], + [ + "magnitude", + "rayleigh_quotient" + ], + [ + "magnitude", + "taylor" + ], + [ + "taylor", + "rayleigh_quotient" + ] + ], + "save_plots": true, + "cluster_analysis": { + "enabled": true, + "scatter_3d": true, + "cluster_evolution_by_layer": true, + "cluster_purity": true + }, + "layer_importance": { + "enabled": true, + "heatmap": true, + "bar_chart": true + }, + "fine_tuning_recovery": { + "enabled": true, + "by_method": true, + "by_sparsity": true + } + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar10/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/vgg16_cifar10", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_benchmark_extension_20260407", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": { + "layer_indices": "all", + "save_scores": true, + "generate_plots": true, + "metrics": [ + "rayleigh_quotient", + "redundancy", + "synergy", + "magnitude", + "taylor", + "activation_sparsity" + ], + "plots": { + "histograms": true, + "scatter_plots": true, + "pruning_curves": true, + "layer_comparison": true, + "filter_correlation": true + }, + "scatter_pairs": [ + [ + "rayleigh_quotient", + "redundancy" + ], + [ + "rayleigh_quotient", + "synergy" + ], + [ + "magnitude", + "taylor" + ], + [ + "redundancy", + "synergy" + ] + ] + }, + 
"hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed1011.yaml new file mode 100644 index 00000000..9a34d062 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed1011.yaml @@ -0,0 +1,326 @@ +{ + "name": "mobilenetv2_cifar100_locked_headline_hybrid_seed1011", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": 
"flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + 
"cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + 
"pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed123.yaml new file mode 100644 index 00000000..eca88b28 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed123.yaml @@ -0,0 +1,326 @@ +{ + "name": "mobilenetv2_cifar100_locked_headline_hybrid_seed123", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + 
"synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": 
"logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + 
"generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + 
"pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed42.yaml new file mode 100644 index 00000000..22fae0f7 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed42.yaml @@ -0,0 +1,326 @@ +{ + "name": "mobilenetv2_cifar100_locked_headline_hybrid_seed42", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + 
"type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": 
true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed456.yaml 
b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed456.yaml new file mode 100644 index 00000000..e2a57a2f --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed456.yaml @@ -0,0 +1,326 @@ +{ + "name": "mobilenetv2_cifar100_locked_headline_hybrid_seed456", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + 
"sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + 
"red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + 
"generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + 
"save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed789.yaml new file mode 100644 index 00000000..e96aeefc --- /dev/null +++ 
b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_hybrid_seed789.yaml @@ -0,0 +1,326 @@ +{ + "name": "mobilenetv2_cifar100_locked_headline_hybrid_seed789", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + 
"compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + 
"do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, 
+ "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed1011.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed1011.yaml new file mode 100644 index 00000000..50e4f9c8 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed1011.yaml @@ -0,0 +1,331 @@ +{ + "name": 
"mobilenetv2_cifar100_locked_headline_main_seed1011", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": 
"gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + 
"outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + 
"dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed123.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed123.yaml new file mode 100644 index 00000000..5181a4c5 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed123.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_locked_headline_main_seed123", + 
"description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, 
+ "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + 
"halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + 
"distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed42.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed42.yaml new file mode 100644 index 00000000..1b79366d --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed42.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_locked_headline_main_seed42", + 
"description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + 
"halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + 
"distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed456.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed456.yaml new file mode 100644 index 00000000..4624df66 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed456.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_locked_headline_main_seed456", + 
"description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, 
+ "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + 
"halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + 
"distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed789.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed789.yaml new file mode 100644 index 00000000..72d11957 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/mobilenetv2_cifar100_locked_headline_main_seed789.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_locked_headline_main_seed789", + 
"description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, 
+ "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + 
"halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + 
"distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed1011.yaml new file mode 100644 index 00000000..4192656a --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed1011.yaml @@ -0,0 +1,293 @@ +{ + "name": "resnet18_cifar100_locked_headline_hybrid_seed1011", + 
"description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + 
"force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + 
"cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + 
"pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + 
"do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed123.yaml new file mode 100644 index 00000000..582ba86f --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed123.yaml @@ -0,0 +1,293 @@ +{ + "name": "resnet18_cifar100_locked_headline_hybrid_seed123", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + 
"rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": 
false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed42.yaml new file mode 100644 index 00000000..1572da69 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed42.yaml @@ -0,0 +1,293 @@ +{ + 
"name": "resnet18_cifar100_locked_headline_hybrid_seed42", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + 
"cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + 
"pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": 
false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed456.yaml new file mode 100644 index 00000000..cba9737c --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed456.yaml @@ -0,0 +1,293 @@ +{ + "name": "resnet18_cifar100_locked_headline_hybrid_seed456", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + 
"rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": 
false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed789.yaml new file mode 100644 index 00000000..959eec06 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_hybrid_seed789.yaml @@ -0,0 +1,293 @@ +{ 
+ "name": "resnet18_cifar100_locked_headline_hybrid_seed789", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + 
"cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + 
"pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": 
false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed1011.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed1011.yaml new file mode 100644 index 00000000..9008402d --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed1011.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_locked_headline_main_seed1011", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + 
"rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": 
false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + 
"base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed123.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed123.yaml new file mode 100644 index 00000000..0032f854 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed123.yaml @@ 
-0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_locked_headline_main_seed123", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 
+ }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + 
"cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + 
"pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + 
"n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed42.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed42.yaml new file mode 100644 index 00000000..1f0c7aaf --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed42.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_locked_headline_main_seed42", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + 
"metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": 
"logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + 
"generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed456.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed456.yaml new file mode 100644 index 00000000..6196d7c5 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed456.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_locked_headline_main_seed456", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + 
"cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + 
"pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + 
"use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed789.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed789.yaml new file mode 100644 index 00000000..c20576ac --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/resnet18_cifar100_locked_headline_main_seed789.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_locked_headline_main_seed789", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": 
"cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": 
"flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": 
{}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed1011.yaml 
b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed1011.yaml new file mode 100644 index 00000000..2468665e --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed1011.yaml @@ -0,0 +1,293 @@ +{ + "name": "vgg16_cifar100_locked_headline_hybrid_seed1011", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": 
false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + 
"geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + 
"composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": 
{ + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed123.yaml new file mode 100644 index 00000000..53ac45b6 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed123.yaml @@ -0,0 +1,293 @@ +{ + "name": "vgg16_cifar100_locked_headline_hybrid_seed123", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 
0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": 
"pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + 
"log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed42.yaml new file mode 100644 index 00000000..681887fb --- /dev/null 
+++ b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed42.yaml @@ -0,0 +1,293 @@ +{ + "name": "vgg16_cifar100_locked_headline_hybrid_seed42", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + 
"cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", 
+ "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + 
"generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed456.yaml new file mode 100644 index 00000000..f2020aa9 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed456.yaml @@ -0,0 +1,293 @@ +{ + "name": "vgg16_cifar100_locked_headline_hybrid_seed456", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + 
"rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + 
"synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + 
"generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed789.yaml new file mode 100644 index 00000000..7caabe30 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_hybrid_seed789.yaml @@ -0,0 +1,293 @@ +{ + 
"name": "vgg16_cifar100_locked_headline_hybrid_seed789", + "description": "Locked unified rerun of Taylor-allocation hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "hybrid", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "cluster_aware_stratified_spectral_pid_rs_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + 
"pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": 
false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed1011.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed1011.yaml new file mode 100644 index 00000000..0a570b9d --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed1011.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_locked_headline_main_seed1011", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + 
"metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + 
"base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed123.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed123.yaml new file mode 100644 index 00000000..2d822f1a --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed123.yaml @@ -0,0 +1,298 
@@ +{ + "name": "vgg16_cifar100_locked_headline_main_seed123", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + 
"pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + 
"n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed42.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed42.yaml new file mode 100644 index 00000000..ca9c818d --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed42.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_locked_headline_main_seed42", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + 
"rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + 
"synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + 
"generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + 
"log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed456.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed456.yaml new file mode 100644 index 00000000..72579798 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed456.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_locked_headline_main_seed456", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": 
false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + 
"cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + 
"pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": 
true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed789.yaml b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed789.yaml new file mode 100644 index 00000000..9fee9317 --- /dev/null +++ b/configs/vision_prune/paper_2026_locked_headline_reruns/vgg16_cifar100_locked_headline_main_seed789.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_locked_headline_main_seed789", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "locked_headline_rerun", + "main", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + 
"weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + 
"task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + 
"generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "cluster_aware_spectral_rs_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "cluster_aware_stratified_spectral_pid_rs_ixy", + "cluster_aware_stratified_spectral_pid_rt_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": 
"./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_locked_headline_20260309", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed1011.yaml 
b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed1011.yaml new file mode 100644 index 00000000..f3fdd27f --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed1011.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed1011", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, 
+ "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + 
"red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + 
"generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": 
"./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed123.yaml new 
file mode 100644 index 00000000..4985a7b2 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed123.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed123", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + 
"loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + 
"generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed42.yaml new file mode 100644 index 00000000..d8092185 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed42.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed42", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + 
"compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + 
"do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + 
"log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed456.yaml new file mode 100644 index 00000000..2b4fce96 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed456.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed456", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + 
"compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + 
"do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + 
"log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed789.yaml new file mode 100644 index 00000000..ff617186 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_global_seed789.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed789", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + 
"compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + 
"do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + 
"log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed1011.yaml new file mode 100644 index 00000000..0cd5c44f --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed1011.yaml @@ -0,0 +1,329 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_hybrid_seed1011", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + 
"compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + 
"do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": 
"./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed123.yaml new file mode 100644 index 00000000..e5e02cd1 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed123.yaml @@ -0,0 +1,329 @@ +{ + 
"name": "mobilenetv2_cifar100_metric_usage_hybrid_seed123", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + 
"outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 
0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed42.yaml new file mode 100644 index 00000000..6cb93797 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed42.yaml @@ -0,0 +1,329 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_hybrid_seed42", + 
"description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": 
{ + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 
95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": 
true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, 
+ "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed456.yaml new file mode 100644 index 00000000..28b19991 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed456.yaml @@ -0,0 +1,329 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_hybrid_seed456", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + 
"mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + 
"gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + 
"geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + 
"magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + 
"evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed789.yaml new file mode 100644 index 00000000..f18d53b4 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_hybrid_seed789.yaml @@ -0,0 +1,329 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_hybrid_seed789", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": 
true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 
1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, 
+ 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + 
"fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed1011.yaml new file mode 100644 index 00000000..fc4587ec --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed1011.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed1011", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, 
+ "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, 
+ "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 
0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + 
"do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed123.yaml new file mode 100644 index 00000000..31ce959a --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed123.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed123", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + 
"dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + 
"exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + 
"scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed42.yaml new file mode 100644 index 00000000..0e6e3f48 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed42.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed42", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + 
"seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + 
"cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + 
"pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": 
true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed456.yaml new file mode 100644 index 00000000..9c866696 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed456.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed456", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + 
"scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + 
"alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + 
"generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + 
"fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + 
"do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed789.yaml new file mode 100644 index 00000000..c5abcb5b --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/mobilenetv2_cifar100_metric_usage_uniform_seed789.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed789", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + 
"rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + 
"alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + 
"generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + 
"alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + 
"do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed1011.yaml new file mode 100644 index 00000000..527b86d8 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed1011.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_global_seed1011", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + 
"metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + 
"run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed123.yaml new file mode 100644 index 00000000..8fe111ca --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed123.yaml @@ -0,0 +1,298 @@ 
+{ + "name": "resnet18_cifar100_metric_usage_global_seed123", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + 
}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + 
"cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, 
+ "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed42.yaml new file mode 100644 index 00000000..8e036dde --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed42.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_global_seed42", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + 
"synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 
10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed456.yaml new file mode 100644 index 00000000..267beaaa --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed456.yaml @@ -0,0 +1,298 @@ 
+{ + "name": "resnet18_cifar100_metric_usage_global_seed456", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + 
}, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + 
"cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, 
+ "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed789.yaml new file mode 100644 index 00000000..c7ee5ab9 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_global_seed789.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_global_seed789", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + 
"synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 
10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed1011.yaml new file mode 100644 index 00000000..332ad4a0 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed1011.yaml @@ -0,0 +1,296 
@@ +{ + "name": "resnet18_cifar100_metric_usage_hybrid_seed1011", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + 
"cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + 
"cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 
5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": 
true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed123.yaml new file mode 100644 index 00000000..8a0f7fdc --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed123.yaml @@ -0,0 +1,296 @@ +{ + "name": "resnet18_cifar100_metric_usage_hybrid_seed123", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + 
"gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + 
"synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + 
"generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed42.yaml new file mode 100644 index 00000000..03c3e395 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed42.yaml @@ -0,0 +1,296 @@ +{ 
+ "name": "resnet18_cifar100_metric_usage_hybrid_seed42", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + 
"cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 
5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": 
true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed456.yaml new file mode 100644 index 00000000..1cf26573 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed456.yaml @@ -0,0 +1,296 @@ +{ + "name": "resnet18_cifar100_metric_usage_hybrid_seed456", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + 
"gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + 
"synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + 
"generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed789.yaml new file mode 100644 index 00000000..f7b7f144 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_hybrid_seed789.yaml @@ -0,0 +1,296 @@ 
+{ + "name": "resnet18_cifar100_metric_usage_hybrid_seed789", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + 
"cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 
5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": 
true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed1011.yaml new file mode 100644 index 00000000..a9af8525 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed1011.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed1011", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + 
"synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 
10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed123.yaml new file mode 100644 index 00000000..2d8ffa22 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed123.yaml @@ -0,0 +1,298 
@@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed123", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + 
"cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed42.yaml new file mode 100644 index 00000000..60090a38 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed42.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed42", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + 
"synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 
10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed456.yaml new file mode 100644 index 00000000..6393b81e --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed456.yaml @@ -0,0 +1,298 
@@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed456", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + 
"cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed789.yaml new file mode 100644 index 00000000..b36469f9 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/resnet18_cifar100_metric_usage_uniform_seed789.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed789", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + 
"synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 
10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed1011.yaml new file mode 100644 index 00000000..fb76f3e5 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed1011.yaml @@ -0,0 +1,298 @@ +{ + 
"name": "vgg16_cifar100_metric_usage_global_seed1011", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed123.yaml new file mode 100644 index 00000000..ea0e75e9 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed123.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_global_seed123", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + 
"type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed42.yaml new file mode 100644 index 00000000..d070dfc1 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed42.yaml @@ -0,0 +1,298 @@ +{ + 
"name": "vgg16_cifar100_metric_usage_global_seed42", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed456.yaml new file mode 100644 index 00000000..bebc7d4c --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed456.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_global_seed456", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + 
"type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed789.yaml new file mode 100644 index 00000000..0fdd3018 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_global_seed789.yaml @@ -0,0 +1,298 @@ +{ + 
"name": "vgg16_cifar100_metric_usage_global_seed789", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed1011.yaml new file mode 100644 index 00000000..60757b9c --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed1011.yaml @@ -0,0 +1,296 @@ +{ + "name": "vgg16_cifar100_metric_usage_hybrid_seed1011", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + 
"synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 
10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed123.yaml new file mode 100644 index 00000000..13072e05 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed123.yaml @@ -0,0 +1,296 @@ +{ + 
"name": "vgg16_cifar100_metric_usage_hybrid_seed123", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, 
+ "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": 
"rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": 
false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed42.yaml new file mode 100644 index 00000000..e1f0fbf2 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed42.yaml @@ -0,0 +1,296 @@ +{ + "name": "vgg16_cifar100_metric_usage_hybrid_seed42", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + 
"metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + 
"run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed456.yaml new file mode 100644 index 00000000..d3e62c8e --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed456.yaml @@ -0,0 +1,296 @@ +{ + 
"name": "vgg16_cifar100_metric_usage_hybrid_seed456", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, 
+ "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": 
"rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": 
false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed789.yaml new file mode 100644 index 00000000..3fc95fd3 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_hybrid_seed789.yaml @@ -0,0 +1,296 @@ +{ + "name": "vgg16_cifar100_metric_usage_hybrid_seed789", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + 
"metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + 
"run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed1011.yaml new file mode 100644 index 00000000..18667e52 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed1011.yaml @@ -0,0 +1,298 @@ +{ 
+ "name": "vgg16_cifar100_metric_usage_uniform_seed1011", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed123.yaml new file mode 100644 index 00000000..d798d813 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed123.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_uniform_seed123", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + 
"type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed42.yaml new file mode 100644 index 00000000..5cdcad17 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed42.yaml @@ -0,0 +1,298 @@ +{ + 
"name": "vgg16_cifar100_metric_usage_uniform_seed42", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": 
null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed456.yaml new file mode 100644 index 00000000..b7f7d91b --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed456.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_uniform_seed456", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + 
"type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed789.yaml new file mode 100644 index 00000000..ae19a425 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation/vgg16_cifar100_metric_usage_uniform_seed789.yaml @@ -0,0 +1,298 @@ +{ + 
"name": "vgg16_cifar100_metric_usage_uniform_seed789", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + 
"tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + 
"cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_20260311", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed1011.yaml new file mode 100644 index 00000000..109728f7 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed1011.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed1011", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + 
"rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + 
"alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + 
"generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + 
"alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + 
"do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed123.yaml new file mode 100644 index 00000000..62fe345a --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed123.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed123", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + 
"gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + 
"rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + 
"generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + 
"cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + 
"do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed42.yaml new file mode 100644 index 00000000..2d66f7da --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed42.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed42", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + 
"synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + 
"generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": 
true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + 
"supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed456.yaml new file mode 100644 index 00000000..530d3c4c --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed456.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed456", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + 
"rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + 
"cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + 
"generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + 
"pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + 
"eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed789.yaml new file mode 100644 index 00000000..904e4a1b --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_global_seed789.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_global_seed789", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": 
"indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + 
"generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, 
+ "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + 
"use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed1011.yaml new file mode 100644 index 00000000..b21bbd36 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed1011.yaml @@ -0,0 +1,329 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_hybrid_seed1011", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 
0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + 
"visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + 
"tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed123.yaml new file mode 100644 index 00000000..71ef774d --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed123.yaml @@ -0,0 +1,329 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_hybrid_seed123", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + 
"definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + 
"simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 
0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": 
"seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": 
true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed42.yaml new file mode 100644 index 00000000..a684e39c --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed42.yaml @@ -0,0 +1,329 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_hybrid_seed42", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": 
"flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, 
+ "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + 
"fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed456.yaml new file mode 100644 index 00000000..9eafc94f --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed456.yaml @@ -0,0 +1,329 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_hybrid_seed456", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + 
"task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + 
"generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": 
true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + 
"pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed789.yaml new file mode 100644 index 00000000..041c1a83 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_hybrid_seed789.yaml @@ -0,0 +1,329 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_hybrid_seed789", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, 
+ "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 
4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + 
"generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, 
+ "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed1011.yaml new file mode 100644 index 00000000..4083b728 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed1011.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed1011", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, 
+ "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed123.yaml new file mode 100644 index 00000000..269b67c0 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed123.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed123", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, 
+ "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed42.yaml new file mode 100644 index 00000000..bb76e8cf --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed42.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed42", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, 
+ "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed456.yaml new file mode 100644 index 00000000..9a03dfe5 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed456.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed456", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, 
+ "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed789.yaml new file mode 100644 index 00000000..e8578951 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/mobilenetv2_cifar100_metric_usage_uniform_seed789.yaml @@ -0,0 +1,331 @@ +{ + "name": "mobilenetv2_cifar100_metric_usage_uniform_seed789", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + 
"synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, 
+ "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed1011.yaml new file mode 100644 index 00000000..4dd72595 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed1011.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_global_seed1011", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, 
+ "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, 
+ "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + 
"do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed123.yaml new file mode 100644 index 00000000..ff4421f1 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed123.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_global_seed123", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + 
"measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + 
"generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + 
"fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + 
"tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed42.yaml new file mode 100644 index 00000000..925e00c3 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed42.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_global_seed42", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + 
"gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + 
"compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + 
"do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": 
"wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed456.yaml new file mode 100644 index 00000000..417e2a82 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed456.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_global_seed456", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed456" + ], + 
"experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + 
"synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 
0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + 
"pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed789.yaml new file mode 100644 index 00000000..dba73fc0 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_global_seed789.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_global_seed789", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + 
"cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": 
false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + 
"generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": 
false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed1011.yaml new file mode 100644 index 00000000..d77881eb --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed1011.yaml @@ -0,0 +1,296 @@ +{ + "name": "resnet18_cifar100_metric_usage_hybrid_seed1011", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ 
+ "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + 
"exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + 
"cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": 
"global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + 
"do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed123.yaml new file mode 100644 index 00000000..f4bb11fc --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed123.yaml @@ -0,0 +1,296 @@ +{ + "name": "resnet18_cifar100_metric_usage_hybrid_seed123", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + 
"rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": 
false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed42.yaml new file mode 100644 index 00000000..35e9d55a --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed42.yaml @@ -0,0 +1,296 @@ +{ + "name": "resnet18_cifar100_metric_usage_hybrid_seed42", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + 
"metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + 
"cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, 
+ 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + 
"permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed456.yaml new file mode 100644 index 00000000..6c6bd89f --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed456.yaml @@ -0,0 +1,296 @@ +{ + "name": "resnet18_cifar100_metric_usage_hybrid_seed456", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + 
"scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + 
"task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + 
"generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed789.yaml new file mode 100644 
index 00000000..9f7ee196 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_hybrid_seed789.yaml @@ -0,0 +1,296 @@ +{ + "name": "resnet18_cifar100_metric_usage_hybrid_seed789", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + 
"cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": 
[ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + 
"use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed1011.yaml new file mode 100644 index 00000000..92fa6d2a --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed1011.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed1011", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 
0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": 
"pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + 
"checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed123.yaml 
b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed123.yaml new file mode 100644 index 00000000..dc7b6701 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed123.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed123", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + 
"taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + 
"geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + 
"magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + 
"supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed42.yaml new file mode 100644 index 00000000..86effc92 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed42.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed42", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + 
"num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + 
"n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + 
"plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed456.yaml new file mode 100644 index 00000000..fce696bf --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed456.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed456", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, 
+ "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, 
+ "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + 
"do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed789.yaml new file mode 100644 index 00000000..d9f90c19 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/resnet18_cifar100_metric_usage_uniform_seed789.yaml @@ -0,0 +1,298 @@ +{ + "name": "resnet18_cifar100_metric_usage_uniform_seed789", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + 
"measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + 
"generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + 
"fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + 
"tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed1011.yaml new file mode 100644 index 00000000..0ed9eee5 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed1011.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_global_seed1011", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + 
"gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + 
"compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + 
"do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": 
"wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed123.yaml new file mode 100644 index 00000000..39d2d24d --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed123.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_global_seed123", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "vgg16", + "seed123" + ], + "experiment_type": 
"cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + 
"taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + 
"cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 
1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + 
"use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed42.yaml new file mode 100644 index 00000000..3ef8cbe3 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed42.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_global_seed42", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": 
"both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + 
"loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + 
"generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + 
"do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed456.yaml new file mode 100644 index 00000000..a1cc1c24 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed456.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_global_seed456", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + 
"global_threshold", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + 
"rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + 
"cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + 
"pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, 
+ "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed789.yaml new file mode 100644 index 00000000..b1fa1d32 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_global_seed789.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_global_seed789", + "description": "Locked ranking-only sweep with global-threshold allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "global_threshold", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, 
+ "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": 
false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + 
"generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + 
"world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed1011.yaml new file mode 100644 index 00000000..15f4a63a --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed1011.yaml @@ -0,0 +1,296 @@ +{ + "name": "vgg16_cifar100_metric_usage_hybrid_seed1011", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + 
"metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": 
true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + 
"cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 
0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + 
"supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed123.yaml new file mode 100644 index 00000000..4ec2e4d1 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed123.yaml @@ -0,0 +1,296 @@ +{ + "name": "vgg16_cifar100_metric_usage_hybrid_seed123", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": 
false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + 
"red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + 
"generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + 
"distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed42.yaml new file mode 100644 index 00000000..6a765ea1 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed42.yaml @@ -0,0 +1,296 @@ +{ + "name": "vgg16_cifar100_metric_usage_hybrid_seed42", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + 
"paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + 
"exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + 
"cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": 
"global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + 
"do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed456.yaml new file mode 100644 index 00000000..fae98489 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed456.yaml @@ -0,0 +1,296 @@ +{ + "name": "vgg16_cifar100_metric_usage_hybrid_seed456", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + 
"metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + 
"generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed789.yaml new file mode 100644 index 00000000..2c764a77 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_hybrid_seed789.yaml @@ -0,0 +1,296 @@ +{ + "name": "vgg16_cifar100_metric_usage_hybrid_seed789", + "description": "Locked ranking sweep with Taylor-based cross-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "hybrid_taylor_allocation", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + 
"metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + 
"cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "ixy_minus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, 
+ 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + 
"permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": true, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed1011.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed1011.yaml new file mode 100644 index 00000000..6c66ab2d --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed1011.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_uniform_seed1011", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + 
"weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + 
"task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + 
"generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed123.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed123.yaml new file mode 100644 index 
00000000..e43f5771 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed123.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_uniform_seed123", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": 
"gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + 
"cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": 
[ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + 
"compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed42.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed42.yaml new file mode 100644 index 00000000..cf6eab37 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed42.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_uniform_seed42", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + 
"scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + 
"task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + 
"generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed456.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed456.yaml new file mode 100644 index 
00000000..5ac300b5 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed456.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_uniform_seed456", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": 
"gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + 
"cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": 
[ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + 
"compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed789.yaml b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed789.yaml new file mode 100644 index 00000000..770cc757 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_usage_allocation_fixmissing/vgg16_cifar100_metric_usage_uniform_seed789.yaml @@ -0,0 +1,298 @@ +{ + "name": "vgg16_cifar100_metric_usage_uniform_seed789", + "description": "Locked ranking-only sweep with uniform per-layer allocation", + "tags": [ + "paper2026", + "metric_usage_allocation", + "uniform", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + 
"scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + 
"task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + 
"generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "taylor", + "composite_ixy", + "ixy_minus_red", + "ixy_plus_red", + "magnitude_plus_ixy", + "magnitude_minus_red", + "composite_pid_no_red", + "composite_pid", + "cluster_aware_quantile_gradient_weighted_ixy" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": 
"./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_usage_allocation_fixmissing_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed1011.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed1011.yaml new file mode 100644 index 00000000..9fb945bf 
--- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed1011.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed1011", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, 
+ "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + 
"compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + 
"do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, 
+ "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed123.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed123.yaml new file mode 100644 index 00000000..edc698fb --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed123.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed123", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + 
"within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + 
"do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + 
"log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed42.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed42.yaml new file mode 100644 index 00000000..957395bb --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed42.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed42", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": 
false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + 
"within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + 
"do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + 
"log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed456.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed456.yaml new file mode 100644 index 00000000..5e44919b --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed456.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed456", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + 
"within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + 
"do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + 
"log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed789.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed789.yaml new file mode 100644 index 00000000..fba48bc1 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_weight_sweep/mobilenetv2_cifar100_metric_weight_sweep_seed789.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed789", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + 
"within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + 
"do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + 
"log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed1011.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed1011.yaml new file mode 100644 index 00000000..7ad315de --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed1011.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed1011", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + 
"cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + 
"ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + 
"use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed123.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed123.yaml new file mode 100644 index 00000000..8dc80c82 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed123.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed123", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + 
"scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + 
"task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + 
"generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": 
"./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed42.yaml 
b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed42.yaml new file mode 100644 index 00000000..27cfd720 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed42.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed42", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + 
"use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 
1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + 
"magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + 
"scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed456.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed456.yaml new file mode 100644 index 00000000..d2b840e0 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed456.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed456", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + 
"device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 
5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 
1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + 
"generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed789.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed789.yaml new file mode 100644 index 00000000..27309d8e --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/resnet18_cifar100_metric_weight_sweep_seed789.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed789", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": 
"logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + 
"damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", 
+ "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + 
"do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed1011.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed1011.yaml new file mode 100644 index 00000000..0c943812 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed1011.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed1011", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + 
"dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + 
"supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + 
"generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": 
"forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + 
"fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed123.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed123.yaml new file mode 100644 index 00000000..bc9b241e --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed123.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed123", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { 
+ "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + 
"bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + 
"measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": 
false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed42.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed42.yaml new file mode 100644 index 00000000..97d5f51d --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed42.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed42", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": 
true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + 
"generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, 
+ "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, 
+ "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed456.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed456.yaml new file mode 100644 index 00000000..4607ada3 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed456.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed456", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + 
"within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + 
"do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": 
"wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed789.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed789.yaml new file mode 100644 index 00000000..c8f2d1c1 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep/vgg16_cifar100_metric_weight_sweep_seed789.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed789", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + 
"model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + 
"save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + 
"generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 
0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed1011.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed1011.yaml new file mode 100644 index 00000000..b71c15ef --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed1011.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed1011", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084525_57211530/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 
0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": 
true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed123.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed123.yaml new file mode 100644 index 00000000..8177aa2b --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed123.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed123", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 
0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": 
true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed42.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed42.yaml new file mode 100644 index 00000000..cc392477 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed42.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed42", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 
0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": 
true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed456.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed456.yaml new file mode 100644 index 00000000..2063aed4 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed456.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed456", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 
0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": 
true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed789.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed789.yaml new file mode 100644 index 00000000..38465afd --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/mobilenetv2_cifar100_metric_weight_sweep_seed789.yaml @@ -0,0 +1,332 @@ +{ + "name": "mobilenetv2_cifar100_metric_weight_sweep_seed789", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "mobilenetv2", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084522_57217372/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 
0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + 
"generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": 
true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed1011.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed1011.yaml new file mode 100644 index 00000000..e614c988 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed1011.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed1011", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080111_57211528/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 
100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + 
"n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + 
"generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed123.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed123.yaml new file mode 100644 index 00000000..5e72ed62 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed123.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed123", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": 
{ + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + 
"cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + 
"ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + 
"percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed42.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed42.yaml new file mode 100644 index 00000000..62490fd5 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed42.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed42", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + 
"optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": 
"pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + 
"post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git 
a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed456.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed456.yaml new file mode 100644 index 00000000..f4fa80f9 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed456.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed456", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + 
"cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + 
"cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + 
"magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + 
"do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed789.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed789.yaml new file mode 100644 index 00000000..63ee95fb --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/resnet18_cifar100_metric_weight_sweep_seed789.yaml @@ -0,0 +1,299 @@ +{ + "name": "resnet18_cifar100_metric_weight_sweep_seed789", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "resnet18", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080112_57211572/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + 
"measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + 
"generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + 
"pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + 
"use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed1011.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed1011.yaml new file mode 100644 index 00000000..69742065 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed1011.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed1011", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed1011" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080043_57211529/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 1011, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + 
"relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + 
"compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 
1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + 
"wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed123.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed123.yaml new file mode 100644 index 00000000..5613945f --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed123.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed123", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy 
weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + 
"exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + 
"cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + 
"pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + 
"do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed42.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed42.yaml new file mode 100644 index 00000000..c8a0fe96 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed42.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed42", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", 
+ "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + 
"type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + 
"generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": 
"./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed456.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed456.yaml new file mode 100644 index 00000000..2fb0ae98 --- /dev/null +++ 
b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed456.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed456", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + 
"cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + 
"ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 
90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed789.yaml b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed789.yaml new file mode 100644 index 00000000..aff14ac9 --- /dev/null +++ b/configs/vision_prune/paper_2026_metric_weight_sweep_fixdispatch/vgg16_cifar100_metric_weight_sweep_seed789.yaml @@ -0,0 +1,299 @@ +{ + "name": "vgg16_cifar100_metric_weight_sweep_seed789", + "description": "Locked uniform-allocation sweep for magnitude/IXY/redundancy weightings", + "tags": [ + "paper2026", + "metric_weight_sweep", + "vgg16", + "seed789" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211573/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 789, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + 
"scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + 
"activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + 
"generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "magnitude", + "composite_ixy", + "cluster_aware_quantile_gradient_weighted_ixy", + "composite_pid_no_red", + "magnitude_plus_ixy_w025", + "magnitude_plus_ixy_w050", + "magnitude_plus_ixy_w075", + "magnitude_plus_ixy_w100", + "ixy_minus_red_w015", + "ixy_minus_red_w030", + "ixy_minus_red_w050", + "ixy_minus_red_w070" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + 
"post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/importance_clustering_metric_weight_sweep_fixdispatch_20260312", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_scaleup/resnet50_imagenet100_two_axis_seed123.yaml 
b/configs/vision_prune/paper_2026_two_axis_scaleup/resnet50_imagenet100_two_axis_seed123.yaml new file mode 100644 index 00000000..757eb74c --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_scaleup/resnet50_imagenet100_two_axis_seed123.yaml @@ -0,0 +1,326 @@ +{ + "name": "resnet50_imagenet100_two_axis_scaleup_seed123", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "two_axis_scaleup", + "resnet50", + "imagenet100", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_023258_64241273/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + 
"activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + 
"run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + 
"generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + 
"save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_scaleup_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_scaleup/resnet50_imagenet100_two_axis_seed42.yaml b/configs/vision_prune/paper_2026_two_axis_scaleup/resnet50_imagenet100_two_axis_seed42.yaml new file mode 100644 index 00000000..b383554b --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_scaleup/resnet50_imagenet100_two_axis_seed42.yaml @@ 
-0,0 +1,326 @@ +{ + "name": "resnet50_imagenet100_two_axis_scaleup_seed42", + "description": "Locked benchmark breadth suite for NeurIPS two-axis appendix controls", + "tags": [ + "paper2026", + "two_axis_scaleup", + "resnet50", + "imagenet100", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet50", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet50_imagenet100_cluster_analysis_mb200_20260306_023248_64241271/checkpoints/trained_model.pth", + "dataset_name": "imagenet100", + "dataset_config": {}, + "data_path": "./data/imagenet100", + "batch_size": 64, + "num_workers": 8, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 30, + "learning_rate": 0.001, + "optimizer": "adam", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0001, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + 
"cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 512, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 
5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.1, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, 
+ 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.3, + 0.5, + 0.7, + 0.8, + 0.9 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 1e-05, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true, + "layer_importance_heatmap": true, + "sensitivity_curves": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet50_imagenet100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet50_imagenet100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_scaleup_20260417", + "wandb_project": null, 
+ "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed123.yaml b/configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed123.yaml new file mode 100644 index 00000000..02611f0f --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed123.yaml @@ -0,0 +1,328 @@ +{ + "name": "mobilenetv2_cifar100_two_axis_sweep_seed123", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "two_axis_sweep", + "mobilenetv2", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": 
"mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084458_57217336/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + 
"use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 
8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p1", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 
0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_sweep_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + 
"do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed42.yaml b/configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed42.yaml new file mode 100644 index 00000000..01bd5c86 --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed42.yaml @@ -0,0 +1,328 @@ +{ + "name": "mobilenetv2_cifar100_two_axis_sweep_seed42", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "two_axis_sweep", + "mobilenetv2", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_080037_57211589/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 
128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + 
"gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + 
"cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p1", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + 
"pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_sweep_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + 
"use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed456.yaml b/configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed456.yaml new file mode 100644 index 00000000..109263ec --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_sweep/mobilenetv2_cifar100_two_axis_seed456.yaml @@ -0,0 +1,328 @@ +{ + "name": "mobilenetv2_cifar100_two_axis_sweep_seed456", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "two_axis_sweep", + "mobilenetv2", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "mobilenet_v2", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/mobilenetv2_cifar100_cluster_analysis_20260127_084514_57217362/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.01, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + 
"rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "activation_l2_norm": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + }, + "composite_weights": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "activation_l2_norm", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + 
"alignment_composite_weights": { + "rayleigh_quotient": 0.33, + "gaussian_mi_analytic": -0.33, + "synergy_gaussian_mmi": 0.33 + }, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": false, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + 
"generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p1", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "uniform", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.9, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 1e-05, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + 
"dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": true, + "pruning_skip_depthwise": true, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": { + "enabled": true, + "format": "pdf", + "dpi": 300, + "style": "seaborn-v0_8-paper", + "histograms": true, + "violin_plots": true, + "correlation_heatmap": true, + "cluster_scatter": true, + "cluster_evolution": true, + "influence_matrix": true, + "halo_properties": true, + "pruning_comparison": true, + "pruning_recovery": true, + "cascade_test": true, + "metric_distributions": true + }, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/mobilenetv2_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/mobilenetv2_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_sweep_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + 
"supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed123.yaml b/configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed123.yaml new file mode 100644 index 00000000..0171e377 --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed123.yaml @@ -0,0 +1,295 @@ +{ + "name": "resnet18_cifar100_two_axis_sweep_seed123", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "two_axis_sweep", + "resnet18", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080045_57211555/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + 
"shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + 
"loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + 
"generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p1", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_sweep_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": 
[ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed42.yaml b/configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed42.yaml new file mode 100644 index 00000000..cc4d0152 --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed42.yaml @@ -0,0 +1,295 @@ +{ + "name": "resnet18_cifar100_two_axis_sweep_seed42", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "two_axis_sweep", + "resnet18", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": 
"/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080032_57211546/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + 
"measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + 
"generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p1", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + 
"cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_sweep_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + 
"fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed456.yaml b/configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed456.yaml new file mode 100644 index 00000000..74d6d911 --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_sweep/resnet18_cifar100_two_axis_seed456.yaml @@ -0,0 +1,295 @@ +{ + "name": "resnet18_cifar100_two_axis_sweep_seed456", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "two_axis_sweep", + "resnet18", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "resnet18", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/resnet18_cifar100_cluster_analysis_20260127_080106_57211563/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.1, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, 
+ "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 
90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.7, + "cluster_aware_anneal_end": 0.9, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + 
"distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p1", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0005, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": true, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/resnet18_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/resnet18_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_sweep_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + 
"scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed123.yaml b/configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed123.yaml new file mode 100644 index 00000000..1ba6e717 --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed123.yaml @@ -0,0 +1,295 @@ +{ + "name": "vgg16_cifar100_two_axis_sweep_seed123", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "two_axis_sweep", + "vgg16", + "seed123" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080034_57211556/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 123, + "train_before_dropout": false, + "training_epochs": 100, + 
"learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + 
"activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + 
"generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p1", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 
1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_sweep_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed42.yaml b/configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed42.yaml new file mode 100644 index 00000000..5267243f --- /dev/null +++ 
b/configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed42.yaml @@ -0,0 +1,295 @@ +{ + "name": "vgg16_cifar100_two_axis_sweep_seed42", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "two_axis_sweep", + "vgg16", + "seed42" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080032_57211547/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 42, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + ], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + 
"cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + "generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p1", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 
5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_sweep_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + "evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": 
false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed456.yaml b/configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed456.yaml new file mode 100644 index 00000000..7e771469 --- /dev/null +++ b/configs/vision_prune/paper_2026_two_axis_sweep/vgg16_cifar100_two_axis_seed456.yaml @@ -0,0 +1,295 @@ +{ + "name": "vgg16_cifar100_two_axis_sweep_seed456", + "description": "Locked unified rerun of headline non-hybrid pruning methods", + "tags": [ + "paper2026", + "two_axis_sweep", + "vgg16", + "seed456" + ], + "experiment_type": "cluster_analysis", + "model_name": "vgg16_bn", + "model_config": {}, + "pretrained": true, + "model_checkpoint": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/vgg16_cifar100_cluster_analysis_20260127_080038_57211564/checkpoints/trained_model.pth", + "dataset_name": "cifar100", + "dataset_config": {}, + "data_path": "./data", + "batch_size": 128, + "num_workers": 4, + "device": "cuda", + "seed": 456, + "train_before_dropout": false, + "training_epochs": 100, + "learning_rate": 0.05, + "optimizer": "sgd", + "scheduler": "cosine", + "scheduler_config": {}, + "weight_decay": 0.0005, + "momentum": 0.9, + "num_networks": 1, + "do_train": false, + "metrics": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "metric_configs": { + "rayleigh_quotient": { + "use_jit": false, + "use_gpu_acceleration": false, + 
"force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "relative": false, + "definition": "both", + "shrinkage": true + }, + "gaussian_mi_analytic": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "sampling": "all" + }, + "synergy_gaussian_mmi": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "target": "logit_margin", + "num_pairs": 10, + "sampling": "top_k" + }, + "taylor": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000, + "criterion": "gradient_weight" + } + }, + "metric_optimization": { + "use_jit": false, + "use_gpu_acceleration": false, + "force_cpu_for_large_ops": true, + "cpu_threshold": 100000000 + }, + "tracked_layers": null, + "scale_by_norm": false, + "force_cpu_for_large_metric_ops": true, + "cnn_rq_aggregation_op": "mean", + "exclude_classification_layer": true, + "alignment_methods": [ + "rayleigh_quotient", + "gaussian_mi_analytic", + "synergy_gaussian_mmi", + "taylor" + ], + "compute_alignment": true, + "save_alignment_history": true, + "measure_alignment_during_training": true, + "alignment_frequency": 1, + "alignment_data_num_samples": 1, + "alignment_computation_texts": [], + "alignment_composite_weights": {}, + "supernode_config": {}, + "cnn_mode": "unfold", + "calibration_mode": "indices", + "calibration_num_workers": 0, + "n_calibration": 5000, + "simulate_post_train_shuffle_epochs": 0, + "simulate_post_train_include_eval": true, + "activation_point": "pre_bn", + "activation_samples": "flatten_spatial", + "task_activation_samples": "match", + "spatial_samples_per_image": 16, + "n_clusters": 4, + "synergy_target": "logit_margin", + "synergy_candidate_pool": 50, + "synergy_pairs": 10, + "type_mapping_mode": "global", + "run_metric_ablation": false, + "metric_ablations": [ + "all", + "rq_red", + "rq_syn", + "red_syn" + 
], + "run_permutation_baseline": false, + "n_permutations": 100, + "compute_loss_proxy": true, + "loss_proxy_n_calibration": 1024, + "compute_within_layer_connectivity": true, + "within_layer_red_topk": 20, + "within_layer_syn_topk": 10, + "routing_bottleneck_topk": 5, + "outred_candidate_pool": 64, + "outred_topm": 8, + "bottleneck_protect_percentile": 95.0, + "halo_percentile": 90.0, + "use_activation_weight": true, + "cascade_n_remove": 5, + "damage_sample_frac": 0.2, + "taylor_samples": 1024, + "geometric_median_iters": 10, + "geometric_median_eps": 1e-08, + "hrank_images": 256, + "hrank_pool": 8, + "hrank_sv_eps": 0.001, + "cluster_aware_alpha": 1.0, + "cluster_aware_beta": 0.5, + "cluster_aware_gamma": 0.3, + "cluster_aware_lambda_halo": 0.5, + "cluster_aware_protect_critical_frac": 0.3, + "cluster_aware_anneal_start": 0.5, + "cluster_aware_anneal_end": 0.8, + "cluster_aware_taylor_weight": 0.3, + "cluster_aware_depth_adaptive": false, + "cluster_aware_early_alpha": 1.5, + "cluster_aware_early_gamma": 0.1, + "cluster_aware_late_alpha": 0.8, + "cluster_aware_late_gamma": 0.5, + "cluster_aware_early_layer_frac": 0.3, + "generalized_taylor_weight_rq": 1.0, + "generalized_taylor_weight_redundancy": 0.3, + "generalized_taylor_weight_synergy": 0.5, + "generalized_taylor_gradient_exponent": 1.0, + "generalized_taylor_activation_exponent": 1.0, + "generalized_taylor_redundancy_discount_beta": 1.0, + "generalized_taylor_synergy_boost_gamma": 0.5, + "generalized_taylor_critical_multiplier": 1.5, + "generalized_taylor_redundant_multiplier": 0.5, + "generalized_taylor_synergistic_multiplier": 1.2, + "generalized_taylor_background_multiplier": 0.8, + "generalized_taylor_gate_mode": "sigmoid", + "generalized_taylor_gate_temperature": 6.0, + "generalized_taylor_gate_bias": 0.5, + "generalized_taylor_gate_eps": 0.05, + "generalized_taylor_gate_min": 0.0, + "generalized_taylor_gate_include_cluster_multiplier": true, + "generalized_taylor_structural_eps": 0.1, + 
"generalized_taylor_rq_log_eps": 1e-10, + "generalized_taylor_grad_over_act_eps": 1e-08, + "generalized_taylor_lp_optimal_l2_reg": 0.01, + "do_dropout_analysis": false, + "do_eigenfeature_analysis": false, + "do_pruning_experiments": true, + "dropout_rates": [ + 0.0, + 0.1, + 0.3, + 0.5, + 0.7, + 0.9 + ], + "dropout_mode": "scaled", + "measure_expected_distribution": true, + "distribution_bins": 50, + "pruning_strategies": [ + "taylor", + "magnitude", + "composite_ixy", + "two_axis_a1_b0", + "two_axis_a1_b0p1", + "two_axis_a1_b0p25", + "two_axis_a1_b0p5", + "two_axis_a1_b1" + ], + "pruning_amounts": [ + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, + 0.95 + ], + "pruning_selection_mode": "low", + "fine_tune_after_pruning": true, + "fine_tune_epochs": 5, + "pruning_alignment_metric": "rayleigh_quotient", + "pruning_hybrid_alpha": 0.5, + "pruning_scope": "layer", + "pruning_distribution": "global_threshold", + "pruning_min_per_layer": 0.0, + "pruning_max_per_layer": 0.95, + "pruning_max_per_layer_sparsity_cap": 1.0, + "fine_tune_learning_rate": 0.0001, + "fine_tune_max_batches": 200, + "fine_tune_weight_decay": 0.0001, + "alignment_structured_pruning": false, + "cascading_direction": "forward", + "dependency_aware_pruning": false, + "pruning_target_layer": null, + "pruning_pointwise_only": false, + "pruning_skip_depthwise": false, + "generate_plots": false, + "plot_format": "png", + "plot_dpi": 300, + "visualization_options": {}, + "post_analysis": {}, + "checkpoint_dir": "./results/vision/vgg16_cifar100/checkpoints", + "checkpoint_interval": 1000, + "save_best": true, + "log_dir": "./results/vision/vgg16_cifar100", + "log_interval": 100, + "plots_dir": "./plots", + "experiment_dir": null, + "base_output_dir": "/n/holylfs06/LABS/kempner_project_b/Lab/alignment/alignment_red/PAPER/two_axis_sweep_20260417", + "wandb_project": null, + "wandb_entity": null, + "distributed": false, + "world_size": 1, + "rank": 0, + "do_perplexity_computation": false, + 
"evaluation_dataset": "wikitext", + "evaluation_num_samples": 100, + "evaluation_metrics": [ + "perplexity" + ], + "llm": {}, + "use_nvidia_fewshot": false, + "use_chain_of_thought": false, + "fewshot_settings": {}, + "do_directed_redundancy": true, + "do_connectivity_pruning": true, + "do_scar_metrics": false, + "do_attention_scar_metrics": false, + "scar_num_samples": 0, + "scar_max_length": 512, + "supernode": {}, + "supernode_robustness": {}, + "supernode_summary": {}, + "halo_analysis": { + "enabled": true, + "percentile": 90.0, + "use_activation_weight": true, + "compute_influence_matrix": true, + "permutation_baseline": { + "enabled": false, + "n_permutations": 100 + } + }, + "generalized_importance": {}, + "do_halo_analysis": true, + "do_generalized_importance": false, + "do_scar_optimal": false, + "do_random_supernode_ablation": false, + "do_supernode_hit_rate_sweep": false, + "supernode_hit_rate_sweep": {}, + "eval_batches": null, + "use_tensorized_training": true, + "use_tensorized_pruning": true, + "use_ultra_parallel_eval": true, + "tokenizer_kwargs": {}, + "model_kwargs": {}, + "analysis_options": {}, + "hybrid_taylor_allocation": false, + "fine_tune_type_aware_enabled": false, + "fine_tune_type_aware_methods": [], + "pruning_enforce_exact_global_channel_budget": false +} diff --git a/scripts/run_experiment.py b/scripts/run_experiment.py index 52df9a3e..c8acd24b 100644 --- a/scripts/run_experiment.py +++ b/scripts/run_experiment.py @@ -278,7 +278,7 @@ def _get_nested(obj, key, default): from alignment.dataops.datasets.unified_dataset import DATASET_CONFIGS from alignment.models.hub import adapt_model_for_dataset - model_name = str(cluster_config.model_name).lower() + requested_model_name = str(cluster_config.model_name).lower() dataset_name = str(cluster_config.dataset_name).lower() # Resolve num_classes: explicit model_config > dataset registry > legacy fallback @@ -292,31 +292,60 @@ def _get_nested(obj, key, default): pretrained = 
bool(getattr(cluster_config, "pretrained", True)) weights_name = model_cfg.get("weights", None) if isinstance(model_cfg, dict) else None - weights_arg = weights_name if pretrained else None - - # Map model_name to torchvision function (handles vgg16->vgg16_bn alias) - _TORCHVISION_MAP = { - "resnet18": ("resnet18", "IMAGENET1K_V1"), - "resnet50": ("resnet50", "IMAGENET1K_V1"), - "vgg16": ("vgg16_bn", "IMAGENET1K_V1"), - "mobilenetv2": ("mobilenet_v2", "IMAGENET1K_V1"), - "mobilenet_v2": ("mobilenet_v2", "IMAGENET1K_V1"), - "mobilenet": ("mobilenet_v2", "IMAGENET1K_V1"), - "alexnet": ("alexnet", "IMAGENET1K_V1"), - } - tv_key = None - for key in _TORCHVISION_MAP: - if key in model_name: - tv_key = key - break + # Allow both direct torchvision names (e.g. "resnet18") and the registry-style + # wrapper form model_name="torchvision_model", model_config.model_name="convnext_tiny". + if requested_model_name in {"torchvision_model", "torchvision"}: + resolved_model_name = str(model_cfg.get("model_name", "")).lower() + if not resolved_model_name: + raise ValueError("model_config.model_name must be set when model_name='torchvision_model'") + else: + resolved_model_name = requested_model_name - if tv_key is None: - raise ValueError(f"Unknown model: {model_name}. Supported: {list(_TORCHVISION_MAP.keys())}") + # Map common paper aliases to torchvision function names. + _TORCHVISION_ALIASES = { + "vgg16": "vgg16_bn", + "mobilenetv2": "mobilenet_v2", + "mobilenet": "mobilenet_v2", + } + tv_func_name = _TORCHVISION_ALIASES.get(resolved_model_name, resolved_model_name) + + if not hasattr(torchvision.models, tv_func_name): + supported = sorted( + { + "alexnet", + "convnext_tiny", + "mobilenet", + "mobilenet_v2", + "mobilenetv2", + "resnet18", + "resnet50", + "vgg16", + } + ) + raise ValueError(f"Unknown model: {resolved_model_name}. 
Supported: {supported}") - tv_func_name, default_weights = _TORCHVISION_MAP[tv_key] tv_func = getattr(torchvision.models, tv_func_name) - model = tv_func(weights=weights_arg or default_weights) + tv_model_kwargs = { + k: v for k, v in (model_cfg.items() if isinstance(model_cfg, dict) else []) if k not in {"model_name", "weights", "checkpoint", "num_classes"} + } + + if pretrained: + try: + if weights_name is not None: + model = tv_func(weights=weights_name, **tv_model_kwargs) + else: + model = tv_func(weights="DEFAULT", **tv_model_kwargs) + except Exception: + try: + model = tv_func(pretrained=True, **tv_model_kwargs) + except Exception: + model = tv_func(**tv_model_kwargs) + else: + try: + model = tv_func(weights=None, **tv_model_kwargs) + except Exception: + model = tv_func(pretrained=False, **tv_model_kwargs) # Adapt classifier head for target num_classes if int(num_classes) != 1000: @@ -330,7 +359,7 @@ def _get_nested(obj, key, default): # Adapt model stem for dataset resolution (CIFAR, Tiny-ImageNet, etc.) 
# This is now handled by a shared utility in src/alignment/models/hub.py - adapt_model_for_dataset(model, model_name, dataset_name, pretrained=pretrained) + adapt_model_for_dataset(model, resolved_model_name, dataset_name, pretrained=pretrained) # Optional: explicit checkpoint checkpoint_path = getattr(cluster_config, "model_checkpoint", None) or (model_cfg.get("checkpoint") if isinstance(model_cfg, dict) else None) @@ -437,7 +466,7 @@ def _get_nested(obj, key, default): torch.save( { "model_state_dict": model.state_dict(), - "model_name": model_name, + "model_name": resolved_model_name, "dataset_name": dataset_name, "num_classes": num_classes, }, diff --git a/src/alignment/analysis/clustering/metric_clustering.py b/src/alignment/analysis/clustering/metric_clustering.py index 5653f1ee..427477f2 100644 --- a/src/alignment/analysis/clustering/metric_clustering.py +++ b/src/alignment/analysis/clustering/metric_clustering.py @@ -5,6 +5,22 @@ import numpy as np +# Archetype names (formerly: critical, redundant, synergistic, background). +# New names avoid overloading PID terminology. +# Geometric meaning is unchanged: high I_X low R_X, high R_X, moderate I_X low R_X, low both. +TYPE_ESSENTIAL = "essential" # formerly "critical" +TYPE_SUBSTITUTABLE = "substitutable" # formerly "redundant" +TYPE_SPECIALIZED = "specialized" # formerly "synergistic" +TYPE_DORMANT = "dormant" # formerly "background" + +# Backward compatibility: map old names to new +_OLD_TO_NEW = { + "critical": TYPE_ESSENTIAL, + "redundant": TYPE_SUBSTITUTABLE, + "synergistic": TYPE_SPECIALIZED, + "background": TYPE_DORMANT, +} + try: from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score @@ -248,12 +264,12 @@ def _types_by_importance( """Assign type names by ranking clusters by mean importance score. 
Higher mean score -> higher-priority type: - rank 3 (highest) = "critical" - rank 2 = "synergistic" - rank 1 = "redundant" - rank 0 (lowest) = "background" + rank 3 (highest) = "essential" + rank 2 = "specialized" + rank 1 = "substitutable" + rank 0 (lowest) = "dormant" """ - type_names_ranked = ["background", "redundant", "synergistic", "critical"] + type_names_ranked = [TYPE_DORMANT, TYPE_SUBSTITUTABLE, TYPE_SPECIALIZED, TYPE_ESSENTIAL] scores = np.asarray(scores).flatten() mean_scores = [] for c in range(n_clusters): @@ -266,7 +282,7 @@ def _types_by_importance( if r < len(type_names_ranked): mapping[c] = type_names_ranked[r] else: - mapping[c] = "background" + mapping[c] = TYPE_DORMANT return mapping def _fit_quantile( @@ -310,10 +326,10 @@ def _fit_quantile( sil = 0.0 # Type mapping: quartile 0 = background (lowest), k-1 = critical (highest) - type_names_ranked = ["background", "redundant", "synergistic", "critical"] + type_names_ranked = [TYPE_DORMANT, TYPE_SUBSTITUTABLE, TYPE_SPECIALIZED, TYPE_ESSENTIAL] tm: Dict[int, str] = {} for i in range(k): - tm[i] = type_names_ranked[i] if i < len(type_names_ranked) else "background" + tm[i] = type_names_ranked[i] if i < len(type_names_ranked) else TYPE_DORMANT tc = {t: int((lab == k_id).sum()) for k_id, t in tm.items()} return ClusterResult( @@ -394,22 +410,22 @@ def _types_greedy(self, c: np.ndarray) -> Dict[int, str]: used = set() i = int(np.argmax(c[:, 0] - c[:, 1])) - m[i] = "critical" + m[i] = TYPE_ESSENTIAL used.add(i) rem = [j for j in range(len(c)) if j not in used] i = rem[int(np.argmax([c[j, 1] for j in rem]))] - m[i] = "redundant" + m[i] = TYPE_SUBSTITUTABLE used.add(i) rem = [j for j in range(len(c)) if j not in used] i = rem[int(np.argmax([c[j, 2] for j in rem]))] - m[i] = "synergistic" + m[i] = TYPE_SPECIALIZED used.add(i) for j in range(len(c)): if j not in m: - m[j] = "background" + m[j] = TYPE_DORMANT return m def _solve_global_assignment(self, scores: np.ndarray) -> Dict[int, str]: @@ -418,11 
+434,11 @@ def _solve_global_assignment(self, scores: np.ndarray) -> Dict[int, str]: Args: scores: [n_clusters, 4] score matrix for - [critical, redundant, synergistic, background]. + [essential, substitutable, specialized, dormant]. """ import itertools - type_names = ["critical", "redundant", "synergistic", "background"] + type_names = [TYPE_ESSENTIAL, TYPE_SUBSTITUTABLE, TYPE_SPECIALIZED, TYPE_DORMANT] n = int(scores.shape[0]) best = None best_score = -1e30 @@ -442,7 +458,7 @@ def _solve_global_assignment(self, scores: np.ndarray) -> Dict[int, str]: # Any extra clusters (if n_clusters > 4) are treated as background. for j in range(n): if int(j) not in mapping: - mapping[int(j)] = "background" + mapping[int(j)] = TYPE_DORMANT return mapping def _scores_global_penalized( @@ -460,13 +476,13 @@ def _scores_global_penalized( w_syn = 1.0 if use_syn else 0.0 scores = np.zeros((len(c), 4), dtype=np.float64) - # critical: high rq, low red + # essential: high I_X, low R_X scores[:, 0] = (w_rq * c[:, 0]) - (w_red * c[:, 1]) - # redundant: high red (with mild penalty for high rq) + # substitutable: high R_X (with mild penalty for high I_X) scores[:, 1] = (w_red * c[:, 1]) - (0.25 * w_rq * c[:, 0]) - # synergistic: high syn (with mild penalty for high red) + # specialized: high complementarity (with mild penalty for high R_X) scores[:, 2] = (w_syn * c[:, 2]) - (0.25 * w_red * c[:, 1]) - # background: close to origin + # dormant: close to origin scores[:, 3] = -((w_rq * np.abs(c[:, 0])) + (w_red * np.abs(c[:, 1])) + (w_syn * np.abs(c[:, 2]))) return scores @@ -484,13 +500,13 @@ def _scores_global_simple( w_syn = 1.0 if use_syn else 0.0 scores = np.zeros((len(c), 4), dtype=np.float64) - # critical: high rq, low red + # essential: high I_X, low R_X scores[:, 0] = (w_rq * c[:, 0]) - (w_red * c[:, 1]) - # redundant: maximize redundancy + # substitutable: maximize shared information scores[:, 1] = w_red * c[:, 1] - # synergistic: maximize synergy + # specialized: maximize 
complementarity scores[:, 2] = w_syn * c[:, 2] - # background: low magnitude in active metric dimensions + # dormant: low magnitude in active metric dimensions scores[:, 3] = -((w_rq * np.abs(c[:, 0])) + (w_red * np.abs(c[:, 1])) + (w_syn * np.abs(c[:, 2]))) return scores diff --git a/src/alignment/analysis/visualization/cluster_plots.py b/src/alignment/analysis/visualization/cluster_plots.py index ad0be31d..7e4dcecc 100644 --- a/src/alignment/analysis/visualization/cluster_plots.py +++ b/src/alignment/analysis/visualization/cluster_plots.py @@ -466,6 +466,8 @@ def plot_centroid_evolution( ("red", "syn", "Redundancy", "Synergy"), ] + scatter = None + for ax, (x_key, y_key, x_label, y_label) in zip(axes, pairs): for ctype in types: traj = trajectories[ctype] @@ -497,8 +499,9 @@ def plot_centroid_evolution( ax.legend(loc="best", fontsize=9) # Add colorbar for depth - cbar = fig.colorbar(scatter, ax=axes, orientation="horizontal", fraction=0.05, pad=0.12, aspect=40) - cbar.set_label("Layer Depth", fontsize=11) + if scatter is not None: + cbar = fig.colorbar(scatter, ax=axes, orientation="horizontal", fraction=0.05, pad=0.12, aspect=40) + cbar.set_label("Layer Depth", fontsize=11) fig.suptitle("Cluster Centroid Evolution Across Network Depth\n(○ = early layers, ■ = late layers)", fontsize=13, fontweight="bold") plt.tight_layout(rect=[0, 0.08, 1, 0.95]) diff --git a/src/alignment/configs/config_loader.py b/src/alignment/configs/config_loader.py index c87037d8..d5644e31 100644 --- a/src/alignment/configs/config_loader.py +++ b/src/alignment/configs/config_loader.py @@ -893,7 +893,9 @@ def _map_nested_to_flat_config(nested_config: Dict[str, Any]) -> Dict[str, Any]: ) # Map training configuration - # Auto-disable training when using pretrained models (unless explicitly enabled) + # Preserve legacy flat training keys when present, and only fall back to + # pretrained-based defaults when the config truly leaves training + # unspecified. 
is_pretrained = flat_config.get("pretrained", False) if "training" in nested_config: @@ -926,6 +928,31 @@ def _map_nested_to_flat_config(nested_config: Dict[str, Any]) -> Dict[str, Any]: if is_pretrained: flat_config["do_train"] = False + # Legacy flat-format overrides. Many existing paper configs specify + # training controls at top level rather than under a nested `training` + # block. Respect those explicit values instead of replacing them with the + # pretrained default above. + if "do_train" in nested_config: + flat_config["do_train"] = nested_config["do_train"] + if "training_epochs" in nested_config: + flat_config["training_epochs"] = nested_config["training_epochs"] + if "learning_rate" in nested_config: + flat_config["learning_rate"] = nested_config["learning_rate"] + if "optimizer" in nested_config: + flat_config["optimizer"] = str(nested_config["optimizer"]).lower() + if "train_before_dropout" in nested_config: + flat_config["train_before_dropout"] = nested_config["train_before_dropout"] + if "scheduler" in nested_config: + flat_config["scheduler"] = nested_config["scheduler"] + if "scheduler_config" in nested_config: + flat_config["scheduler_config"] = nested_config["scheduler_config"] + if "momentum" in nested_config: + flat_config["momentum"] = nested_config["momentum"] + if "weight_decay" in nested_config: + flat_config["weight_decay"] = nested_config["weight_decay"] + if "num_networks" in nested_config: + flat_config["num_networks"] = nested_config["num_networks"] + # Map alignment/metrics settings # Priority: metrics.enabled > alignment.methods > alignment_methods > default flat_config["alignment_methods"] = nested_config.get("alignment_methods", ["rayleigh_quotient"]) diff --git a/src/alignment/experiments/cluster_experiments.py b/src/alignment/experiments/cluster_experiments.py index 4bb9fb8c..dbb1f0b5 100644 --- a/src/alignment/experiments/cluster_experiments.py +++ b/src/alignment/experiments/cluster_experiments.py @@ -951,9 +951,21 @@ def 
_bn_for_conv_name(conv_name: str): metrics["weight_norm_sq"] = weight_norm[:n_channels].astype(np.float64) metrics["activation_var"] = var_y[:n_channels].astype(np.float64) - # 1b) Input MI proxy (scale-sensitive): 0.5 * log(1 + RQ * ||w||^2 / sigma0^2) - # We use a per-layer reference sigma0^2 to make the proxy comparable across depth. - signal_power = (metrics["rq"] * weight_norm[:n_channels]).astype(np.float64) + # 1b) Input MI proxy (Gaussian channel MI): + # I_X(i) = 0.5 * log(1 + J_i / sigma0^2) + # where J_i = w_i^T Sigma_X w_i is signal power. + # + # signal_power = rq * ||w||^2. When rq_exact is active (covariance-based), + # this approximates w^T Sigma_X w directly. When rq_equiv is used + # (streaming Var(Y_postact) / ||w||^2), the product equals Var(Y_postact), + # which approximates w^T Sigma_X w up to nonlinearity effects (BN, ReLU). + # Rank-order between exact and streaming paths: Kendall tau > 0.99 (verified). + # + # Prefer rq_exact when available for the correct w^T Sigma_X w computation. 
+ if metrics.get("rq_exact") is not None: + signal_power = (metrics["rq_exact"] * weight_norm[:n_channels]).astype(np.float64) + else: + signal_power = (metrics["rq"] * weight_norm[:n_channels]).astype(np.float64) sigma_mode = str(getattr(self.config, "mi_in_proxy_sigma_mode", "median") or "median") sigma_fixed = float(getattr(self.config, "mi_in_proxy_sigma_fixed", 0.0) or 0.0) sigma_quantile = float(getattr(self.config, "mi_in_proxy_sigma_quantile", 50.0) or 50.0) @@ -964,15 +976,19 @@ def _bn_for_conv_name(conv_name: str): sigma_quantile=sigma_quantile, ) metrics["mi_in_proxy"] = mi_in_proxy + metrics["signal_power"] = signal_power # J_i = w^T Sigma_X w (or streaming approx) metrics["mi_in_proxy_sigma0_sq"] = np.full(n_channels, sigma0_sq, dtype=np.float64) - # 2) Redundancy via Gaussian MI from correlations + # 2) Within-layer shared information via Gaussian MI from correlations + # R_X(i,j) = -0.5 * log(1 - rho_{ij}^2), per-channel: bar_R_X(i) = mean_j R_X(i,j) + # This is pairwise Gaussian MI, NOT a PID redundancy atom. 
denom = np.sqrt(np.outer(var_y, var_y)) + 1e-12 corr = cov_yy / denom corr = np.clip(corr, -0.999, 0.999) mi_matrix = -0.5 * np.log(1.0 - corr**2) np.fill_diagonal(mi_matrix, 0.0) - metrics["redundancy"] = mi_matrix.mean(axis=1).astype(np.float64) + metrics["redundancy"] = mi_matrix.mean(axis=1).astype(np.float64) # bar_R_X(i) + metrics["shared_info_internal"] = metrics["redundancy"] # alias (new name) # 3) TaskMI + Synergy with scalar target under Gaussian approximation (MMI) # @@ -3109,20 +3125,29 @@ def _compute_layer_scores_for_method(self, method: str, model: nn.Module) -> Dic if values is None: continue layer_scores[name] = torch.as_tensor(values, dtype=torch.float32, device=device) - elif method in { - "composite", - "composite_pos_red", - "composite_twoaxis", - "composite_twoaxis_ixy", - "composite_pid", - "composite_pid_no_red", - "composite_pid_unique", - "rq_minus_red", - "rq_plus_red", - "magnitude_plus_rq", - "magnitude_minus_red", - "magnitude_plus_red", - }: + elif ( + method + in { + "composite", + "composite_pos_red", + "composite_twoaxis", + "composite_twoaxis_ixy", + "composite_pid", + "composite_pid_no_red", + "composite_pid_unique", + "ixy_minus_red", + "ixy_plus_red", + "rq_minus_red", + "rq_plus_red", + "magnitude_plus_rq", + "magnitude_plus_ixy", + "magnitude_minus_red", + "magnitude_plus_red", + } + or method.startswith("magnitude_plus_ixy_w") + or method.startswith("ixy_minus_red_w") + or method.startswith("two_axis_a") + ): comp = self._compute_composite_metric(method, metrics, layer, depth_frac=depth_frac) if comp is not None: layer_scores[name] = comp.to(device) @@ -3315,6 +3340,14 @@ def normalize(arr: np.ndarray) -> np.ndarray: return np.zeros_like(arr) return (arr - min_v) / (max_v - min_v) + def parse_weight_suffix(name: str, prefix: str) -> Optional[float]: + if not name.startswith(prefix): + return None + suffix = name[len(prefix) :] + if not suffix.isdigit(): + return None + return float(int(suffix)) / 100.0 + rq_norm = 
normalize(rq) ixy_norm = normalize(ixy) red_norm = normalize(redundancy) @@ -3332,6 +3365,19 @@ def normalize(arr: np.ndarray) -> np.ndarray: scores = ixy_norm + 0.5 * syn_norm - 0.3 * red_norm elif method == "composite_ixy_pos_red": scores = ixy_norm + 0.5 * syn_norm + 0.3 * red_norm + # Two-axis Rule-1 score: alpha * I_X - beta * I(T;Y). + # Named variants two_axis_a{alpha}_b{beta}, with 'p' as decimal point. + # E.g. two_axis_a1_b0p25 = alpha=1.0, beta=0.25. + elif method.startswith("two_axis_a"): + try: + remainder = method[len("two_axis_a") :] + a_str, b_str = remainder.split("_b") + alpha = float(a_str.replace("p", ".")) + beta = float(b_str.replace("p", ".")) + except (ValueError, IndexError): + logger.warning(f"Could not parse two-axis variant '{method}', using alpha=1,beta=0") + alpha, beta = 1.0, 0.0 + scores = alpha * ixy_norm - beta * task_mi_norm # Depth-adaptive two-axis composite (score-only; no halo term) elif method in {"composite_twoaxis", "composite_twoaxis_ixy"}: # Local axis: first metric (RQ or IXY), internal redundancy, synergy. 
@@ -3390,6 +3436,12 @@ def normalize(arr: np.ndarray) -> np.ndarray: w = layer.weight.detach().view(layer.weight.shape[0], -1) mag = normalize(w.norm(p=2, dim=1).cpu().numpy()) scores = mag + 0.3 * red_norm + elif (weight := parse_weight_suffix(method, "magnitude_plus_ixy_w")) is not None: + w = layer.weight.detach().view(layer.weight.shape[0], -1) + mag = normalize(w.norm(p=2, dim=1).cpu().numpy()) + scores = mag + weight * ixy_norm + elif (weight := parse_weight_suffix(method, "ixy_minus_red_w")) is not None: + scores = ixy_norm - weight * red_norm else: return None diff --git a/src/alignment/experiments/llm_experiments.py b/src/alignment/experiments/llm_experiments.py index ff89a7e4..c4bbe8c3 100644 --- a/src/alignment/experiments/llm_experiments.py +++ b/src/alignment/experiments/llm_experiments.py @@ -2129,6 +2129,52 @@ def compute_importance_scores(self, num_samples: int = 1, dim="input") -> Dict[s self.importance_scores[layer_name] = layer_scores + # For FFN channel comparisons and structured pruning, down_proj should use + # the FFN-channel-side activation score (intermediate width), not the hidden + # output width of down_proj itself. Reuse sibling gate/up activation scores + # when they match the down_proj input width. 
+ try: + module_map = dict(self.wrapped_model._model.named_modules()) + for layer_name, layer_scores in list(self.importance_scores.items()): + if "mlp.down_proj" not in layer_name: + continue + + layer_module = module_map.get(layer_name) + if layer_module is None or not hasattr(layer_module, "weight"): + continue + target_dim = int(layer_module.weight.shape[1]) + + act_scores = layer_scores.get("activation_l2_norm") + if torch.is_tensor(act_scores) and int(act_scores.numel()) == target_dim: + continue + + replacement = None + replacement_src = None + for sibling_proj in ("gate_proj", "up_proj"): + sibling_name = layer_name.replace("down_proj", sibling_proj) + sibling_scores = (self.importance_scores.get(sibling_name) or {}).get("activation_l2_norm") + if torch.is_tensor(sibling_scores) and int(sibling_scores.numel()) == target_dim: + replacement = sibling_scores.detach().clone() + replacement_src = sibling_name + break + + if replacement is not None: + layer_scores["activation_l2_norm"] = replacement + if getattr(self.config, "supernode", {}) or getattr(self.config, "supernode_config", {}): + try: + composite_score = layer_scores.get("composite") + self._apply_supernode_selection(layer_scores, composite_score if torch.is_tensor(composite_score) else None) + except Exception: + pass + self.importance_scores[layer_name] = layer_scores + logger.info( + "Aligned activation_l2_norm for %s using FFN-channel activations from %s", + layer_name, + replacement_src, + ) + except Exception as align_err: + logger.debug(f"Failed to align down_proj activation scores: {align_err}") + return self.importance_scores def compute_scar_supernode_metrics( @@ -3585,8 +3631,25 @@ def analyze_supernode_connections( if not layer_matches: continue - # Get the metric for supernode identification (configurable) + # Get the metric for supernode identification (configurable). 
+ # SCAR metrics live in `scar_scores`, while baseline metrics such as + # activation_l2_norm live in `self.importance_scores`. + def _get_importance_metric(name: str, metric_name: str) -> Optional[torch.Tensor]: + candidates = [name] + normalized = name.replace("model.model.", "model.") + denormalized = name.replace("model.", "model.model.", 1) if name.startswith("model.") else name + for cand in (normalized, denormalized): + if cand not in candidates: + candidates.append(cand) + for cand in candidates: + metric_scores = (self.importance_scores.get(cand) or {}).get(metric_name) + if metric_scores is not None: + return metric_scores + return None + supernode_scores = layer_metrics.get(supernode_metric) + if supernode_scores is None: + supernode_scores = _get_importance_metric(layer_name, supernode_metric) if supernode_scores is None: # Fallback to activation power if requested metric not available supernode_scores = layer_metrics.get("scar_activation_power") @@ -3885,6 +3948,25 @@ def analyze_supernode_robustness( results = {} viz = UnifiedVisualizer() + def _lookup_importance_metric(layer_name: str, metric_name: str) -> Optional[torch.Tensor]: + # importance_scores keys can vary (model.layers vs model.model.layers, etc.) 
+ candidates = [ + layer_name, + layer_name.replace("model.layers.", "model.model.layers."), + layer_name.replace("model.model.layers.", "model.layers."), + layer_name.replace("model.", ""), + ] + seen = set() + for key in candidates: + if key in seen: + continue + seen.add(key) + layer_scores = self.importance_scores.get(key) or {} + metric_scores = layer_scores.get(metric_name) + if metric_scores is not None: + return metric_scores + return None + for layer_name, layer_module in down_proj_layers: logger.info(f"\n Analyzing layer: {layer_name}") @@ -3929,10 +4011,11 @@ def analyze_supernode_robustness( # SCAR metrics from compute_scar_supernode_metrics if metric_name in scar_scores: metric_scores_layer[metric_name] = scar_scores[metric_name].float().cpu() - elif metric_name in self.importance_scores.get(layer_name, {}): - # Pre-computed importance scores - metric_scores_layer[metric_name] = self.importance_scores[layer_name][metric_name].float().cpu() else: + precomputed_scores = _lookup_importance_metric(layer_name, metric_name) + if precomputed_scores is not None: + metric_scores_layer[metric_name] = precomputed_scores.float().cpu() + continue # Try computing on the fly try: if metric_name == "activation_l2_norm": @@ -4125,25 +4208,27 @@ def analyze_supernode_robustness( # Summary statistics across all layers # ========================================================= if results: + jaccard_means = [] + spearman_means = [] + stable_fracs = [] + for layer_result in results.values(): + n_metrics = len(layer_result.get("metrics_analyzed", [])) + if n_metrics >= 2 and "jaccard_matrix" in layer_result: + j_vals = np.array(layer_result["jaccard_matrix"])[np.triu_indices(n_metrics, k=1)] + if j_vals.size: + jaccard_means.append(float(np.mean(j_vals))) + if n_metrics >= 2 and "spearman_matrix" in layer_result: + s_vals = np.array(layer_result["spearman_matrix"])[np.triu_indices(n_metrics, k=1)] + if s_vals.size: + spearman_means.append(float(np.mean(s_vals))) + if 
"num_highly_stable" in layer_result and "num_supernodes" in layer_result and layer_result["num_supernodes"] > 0: + stable_fracs.append(float(layer_result["num_highly_stable"]) / float(layer_result["num_supernodes"])) + summary = { "num_layers_analyzed": len(results), - "avg_jaccard_across_metrics": np.mean( - [ - np.mean(np.array(r["jaccard_matrix"])[np.triu_indices(len(r["metrics_analyzed"]), k=1)]) - for r in results.values() - if "jaccard_matrix" in r - ] - ), - "avg_spearman_across_metrics": np.mean( - [ - np.mean(np.array(r["spearman_matrix"])[np.triu_indices(len(r["metrics_analyzed"]), k=1)]) - for r in results.values() - if "spearman_matrix" in r - ] - ), - "avg_highly_stable_fraction": np.mean( - [r["num_highly_stable"] / r["num_supernodes"] for r in results.values() if "num_highly_stable" in r] - ), + "avg_jaccard_across_metrics": float(np.mean(jaccard_means)) if jaccard_means else float("nan"), + "avg_spearman_across_metrics": float(np.mean(spearman_means)) if spearman_means else float("nan"), + "avg_highly_stable_fraction": float(np.mean(stable_fracs)) if stable_fracs else float("nan"), } results["summary"] = summary @@ -4202,6 +4287,8 @@ def hook_fn(module, input, output): with torch.no_grad(): for i in range(0, len(texts), batch_size): batch_texts = texts[i : i + batch_size] + if getattr(self.tokenizer, "pad_token_id", None) is None and getattr(self.tokenizer, "eos_token", None) is not None: + self.tokenizer.pad_token = self.tokenizer.eos_token inputs = self.tokenizer( batch_texts, return_tensors="pt", @@ -7802,8 +7889,15 @@ def apply_unstructured_baseline_pruning( if metric not in {"wanda_unstructured", "sparsegpt_unstructured"}: raise ValueError(f"Unknown unstructured baseline metric: {metric}") - # Ensure baseline calibrations exist - num_calib = getattr(self.config, "scar_num_samples", 128) + # Ensure baseline calibrations exist. 
Some runs disable SCAR metrics entirely, + # which can leave scar_num_samples at 0 even though a general calibration + # budget is configured elsewhere. + num_calib = ( + getattr(self.config, "scar_num_samples", None) + or getattr(self.config, "alignment_data_num_samples", None) + or getattr(self.config, "n_calibration", None) + or 128 + ) if metric == "wanda_unstructured": wanda = getattr(self, "_wanda_baseline", None) if wanda is None: @@ -7994,6 +8088,52 @@ def apply_pruning(self, sparsity: float = 0.2, metric: str = "activation_l2_norm super_pruned = 0 layers_with_super = 0 layers_with_super_pruned = 0 + nodes_total = 0 + nodes_pruned = 0 + both_total = 0 + both_pruned = 0 + + supernode_cfg = getattr(self.config, "supernode", {}) or getattr(self.config, "supernode_config", {}) or {} + + def _make_supernode_mask(metric_scores: torch.Tensor) -> torch.Tensor: + num_neurons = int(metric_scores.numel()) + if num_neurons <= 0: + return torch.zeros_like(metric_scores, dtype=torch.bool) + + top_k_cfg = supernode_cfg.get("top_k") + core_fraction = float(supernode_cfg.get("core_fraction", 0.1)) + min_core = max(1, int(supernode_cfg.get("min_core_neurons", 1))) + + if top_k_cfg is not None: + num_core = min(num_neurons, int(top_k_cfg)) + else: + num_core = max(1, int(round(core_fraction * num_neurons))) + + num_core = max(num_core, min_core) + num_core = min(num_core, num_neurons) + + _, top_indices = torch.topk(metric_scores, k=num_core, largest=True) + out = torch.zeros_like(metric_scores, dtype=torch.bool) + out[top_indices] = True + return out + + def _get_layer_metric_scores(base_layer_name: str, layer_idx: str, metric_name: str) -> Optional[torch.Tensor]: + key_candidates = [ + base_layer_name, + base_layer_name.replace("model.model.", "model."), + base_layer_name.replace("model.", "model.model.", 1), + f"model.layers.{layer_idx}.mlp.down_proj", + f"model.model.layers.{layer_idx}.mlp.down_proj", + f"model.layers.{layer_idx}.mlp.gate_proj", + 
f"model.model.layers.{layer_idx}.mlp.gate_proj", + f"model.layers.{layer_idx}.mlp.up_proj", + f"model.model.layers.{layer_idx}.mlp.up_proj", + ] + for cand in key_candidates: + metric_vals = (self.importance_scores.get(cand) or {}).get(metric_name) + if torch.is_tensor(metric_vals): + return metric_vals + return None for layer_name in self.importance_scores.keys(): if metric not in self.importance_scores[layer_name]: @@ -8058,6 +8198,12 @@ def apply_pruning(self, sparsity: float = 0.2, metric: str = "activation_l2_norm # Create mask based on importance scores mask = pruner.create_pruning_mask(scores) + try: + nodes_total += int(mask.numel()) + nodes_pruned += int((mask == 0).sum().item()) + except Exception: + pass + # Diagnostic: how many supernodes did we prune in this layer? if core_mask is not None: try: @@ -8078,6 +8224,21 @@ def apply_pruning(self, sparsity: float = 0.2, metric: str = "activation_l2_norm # Never fail pruning due to diagnostics. pass + try: + scar_lp_scores = _get_layer_metric_scores(layer_name, layer_idx, "scar_loss_proxy") + act_l2_scores = _get_layer_metric_scores(layer_name, layer_idx, "activation_l2_norm") + if torch.is_tensor(scar_lp_scores) and torch.is_tensor(act_l2_scores): + if scar_lp_scores.numel() == act_l2_scores.numel() == mask.numel(): + scar_mask = _make_supernode_mask(scar_lp_scores.to(device=mask.device, dtype=torch.float32)) + act_mask = _make_supernode_mask(act_l2_scores.to(device=mask.device, dtype=torch.float32)) + both_mask = scar_mask & act_mask + pruned = mask == 0 + + both_total += int(both_mask.sum().item()) + both_pruned += int((pruned & both_mask).sum().item()) + except Exception: + pass + # Get the MLP module - use underlying model to handle HFCausalLM wrapper underlying_model = self._get_underlying_model() module_dict = dict(underlying_model.named_modules()) @@ -8154,6 +8315,12 @@ def apply_pruning(self, sparsity: float = 0.2, metric: str = "activation_l2_norm "supernodes_total": int(super_total), 
"supernodes_pruned": int(super_pruned), "supernodes_pruned_frac": (float(super_pruned) / float(super_total)) if super_total > 0 else None, + "nodes_total": int(nodes_total), + "nodes_pruned": int(nodes_pruned), + "nodes_pruned_frac": (float(nodes_pruned) / float(nodes_total)) if nodes_total > 0 else None, + "supernodes_both_scar_lp_activation_l2_total": int(both_total), + "supernodes_both_scar_lp_activation_l2_pruned": int(both_pruned), + "supernodes_both_scar_lp_activation_l2_pruned_frac": (float(both_pruned) / float(both_total)) if both_total > 0 else None, "layers_with_supernodes": int(layers_with_super), "layers_with_supernodes_pruned": int(layers_with_super_pruned), } @@ -9024,9 +9191,15 @@ class _SkipScarVisualizations(Exception): logger.info(f"Checking baseline strategies: pruning_strategies={pruning_strategies}, baseline_strategies={baseline_strategies}") if baseline_strategies: try: + baseline_num_calib = ( + getattr(self.config, "scar_num_samples", None) + or getattr(self.config, "alignment_data_num_samples", None) + or getattr(self.config, "n_calibration", None) + or 128 + ) baseline_scores = self.compute_baseline_pruning_scores( strategies=baseline_strategies, - num_calibration_samples=getattr(self.config, "scar_num_samples", 128), + num_calibration_samples=baseline_num_calib, ) logger.info(f"Computed baseline pruning scores for {len(baseline_scores)} layers") except Exception as base_err: diff --git a/src/alignment/pruning/strategies/metric_based.py b/src/alignment/pruning/strategies/metric_based.py index 7838e5af..342cbe6e 100644 --- a/src/alignment/pruning/strategies/metric_based.py +++ b/src/alignment/pruning/strategies/metric_based.py @@ -60,6 +60,13 @@ class MetricPruningConfig(PruningConfig): synergistic_bonus: float = 0.2 # Bonus for synergistic channels background_penalty: float = 0.1 # Penalty for background channels + # Two-axis score (Rule 1): score = alpha * I_X - beta * I(T;Y) + # High score = keep; low score = prune. 
+ # alpha=1, beta=0 recovers pure local I_X ranking. + # beta>0 penalises task-relevant channels (Q2 danger zone). + alpha_ix: float = 1.0 + beta_ity: float = 0.0 + # ============================================================================= # SINGLE METRIC PRUNING @@ -141,6 +148,18 @@ def compute_importance_scores( + self.config.weight_mi * self._normalize(tmi) ) + elif metric in {"two_axis", "ix_minus_ity", "two_axis_score"}: + # Rule 1 score: alpha * I_X - beta * I(T;Y). + # Uses the same mi_in_proxy field already populated by the metric pipeline + # (Gaussian input-capture proxy, i.e. I_X in the paper notation) and the + # task_mi field (task-direction MI). Both are normalized before combining + # so that alpha/beta have comparable scales. + ix = metrics.get("mi_in_proxy", np.zeros(n_channels)) + ity = metrics.get("task_mi", metrics.get("mi_task", np.zeros(n_channels))) + alpha = float(getattr(self.config, "alpha_ix", 1.0)) + beta = float(getattr(self.config, "beta_ity", 0.0)) + scores = alpha * self._normalize(ix) - beta * self._normalize(ity) + elif metric in {"magnitude", "mag"}: if hasattr(module, "weight"): w = module.weight.detach().view(n_channels, -1) @@ -510,7 +529,38 @@ def create_metric_pruning_strategy( method = method.lower() # Single metric methods - single_metrics = {"rq", "redundancy", "synergy", "mi", "mi_in", "magnitude", "composite"} + single_metrics = { + "rq", + "redundancy", + "synergy", + "mi", + "mi_in", + "magnitude", + "composite", + "two_axis", + "ix_minus_ity", + "two_axis_score", + } + + # Two-axis variants with embedded alpha/beta so configs can select them by + # name alone (no custom kwarg threading through the experiment runner). + # Convention: two_axis_aA_bB encodes alpha=A, beta=B with 'p' as decimal point. + # Examples: two_axis_a1_b0p25 (alpha=1, beta=0.25), two_axis_a1_b1 (alpha=1, beta=1). 
+ if method.startswith("two_axis_a"): + try: + remainder = method[len("two_axis_a") :] + a_str, b_str = remainder.split("_b") + alpha = float(a_str.replace("p", ".")) + beta = float(b_str.replace("p", ".")) + config = MetricPruningConfig( + metric="two_axis", + alpha_ix=alpha, + beta_ity=beta, + **config_kwargs, + ) + return SingleMetricPruning(config, precomputed_metrics) + except (ValueError, IndexError): + logger.warning(f"Could not parse two-axis variant '{method}', falling through") if method in single_metrics: config = MetricPruningConfig(metric=method, **config_kwargs) From 50d78f7c5596acda3f905dc1dddac78ddfae8b3b Mon Sep 17 00:00:00 2001 From: Houman Safaai Date: Mon, 20 Apr 2026 01:02:33 -0400 Subject: [PATCH 4/7] add olmo configs --- configs/prune_llm/olmo2_7b_ckpt_template.yaml | 130 ++++++++++++++++ .../prune_llm/olmo2_7b_pruning_curves.yaml | 143 ++++++++++++++++++ .../olmo2_ckpts_generated/olmo2_7b_main.yaml | 130 ++++++++++++++++ .../olmo2_7b_s1_step150_1B.yaml | 130 ++++++++++++++++ .../olmo2_7b_s1_step20000_84B.yaml | 130 ++++++++++++++++ .../olmo2_7b_s1_step2000_9B.yaml | 130 ++++++++++++++++ .../olmo2_7b_s1_step300000_1259B.yaml | 130 ++++++++++++++++ .../olmo2_7b_s1_step5000_21B.yaml | 130 ++++++++++++++++ .../olmo2_7b_s1_step600000_2517B.yaml | 130 ++++++++++++++++ .../olmo2_7b_s1_step600_3B.yaml | 130 ++++++++++++++++ .../olmo2_7b_s1_step900000_3775B.yaml | 130 ++++++++++++++++ .../olmo2_7b_s1_step99000_416B.yaml | 130 ++++++++++++++++ 12 files changed, 1573 insertions(+) create mode 100644 configs/prune_llm/olmo2_7b_ckpt_template.yaml create mode 100644 configs/prune_llm/olmo2_7b_pruning_curves.yaml create mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_main.yaml create mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step150_1B.yaml create mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step20000_84B.yaml create mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step2000_9B.yaml create 
mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step300000_1259B.yaml create mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step5000_21B.yaml create mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600000_2517B.yaml create mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600_3B.yaml create mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step900000_3775B.yaml create mode 100644 configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step99000_416B.yaml diff --git a/configs/prune_llm/olmo2_7b_ckpt_template.yaml b/configs/prune_llm/olmo2_7b_ckpt_template.yaml new file mode 100644 index 00000000..f94eabda --- /dev/null +++ b/configs/prune_llm/olmo2_7b_ckpt_template.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders @REVISION@ and @RUN_TAG@ are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_@RUN_TAG@" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_@RUN_TAG@" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "@REVISION@" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "@REVISION@" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: false + + conditional_halo_ablation: + enabled: false + + 
lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_7b_pruning_curves.yaml b/configs/prune_llm/olmo2_7b_pruning_curves.yaml new file mode 100644 index 00000000..7217c741 --- /dev/null +++ b/configs/prune_llm/olmo2_7b_pruning_curves.yaml @@ -0,0 +1,143 @@ +# ============================================================================ +# OLMo-2-1124-7B STRUCTURED PRUNING CURVES +# ============================================================================ +# Purpose: +# - Extend the cross-model generalisation suite (Llama-3.1-8B, Mistral-7B, +# Llama-2-7B, Qwen2-7B) to a fifth SwiGLU model from a different training +# lineage (Allen AI OLMo-2), and replicate the SCAR-vs-baseline pruning gap. +# - Pair with the checkpoint-trajectory analysis so the paper reports BOTH +# that supernodes exist in OLMo-main AND that protecting them improves +# structured pruning in the same way as the existing four models. +# ============================================================================ + +experiment: + name: "olmo2_7b_pruning_curves" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_pruning_curves" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Use final (main) checkpoint for pruning. 
+model_kwargs: + revision: "main" + +tokenizer_kwargs: + revision: "main" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "activation_l2_norm" + num_samples: 64 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: true + evaluation_num_samples: 100 + evaluation_metrics: + - "perplexity" + perplexity_protocol: "oats" + wikitext_subset: "wikitext-2-raw-v1" + perplexity_seq_len: 2048 + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: true +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + - "supernode_protection_score" + - "supernode_connectivity_score" + cross_layer_analysis: false + compare_by_connection: true + compute_metrics: + - "activation" + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: false + outlier_analysis: false + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: true + target: "ffn" + structured: true + dependency_aware: true + distribution: "uniform" + min_per_layer: 0.0 + max_per_layer: 0.95 + + # Match the shape of the 70B curves (30/50/70) plus the 20-60 points used + # for the 8B cross-model figure; keep the list compact to stay within time. 
+ sparsity_levels: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7] + selection_modes: ["low"] + algorithms: + - "scar_loss_proxy" + - "supernode_protection_score" + - "supernode_connectivity_score" + - "activation_l2_norm" + - "weight_magnitude" + - "wanda" + - "random" + +evaluation: + enabled: true diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_main.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_main.yaml new file mode 100644 index 00000000..7e678278 --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_main.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders main and main are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_main" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_main" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "main" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "main" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: false + + conditional_halo_ablation: + enabled: false + + lp_ablation_validation: 
+ enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step150_1B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step150_1B.yaml new file mode 100644 index 00000000..26d6f837 --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step150_1B.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders stage1-step150-tokens1B and s1_step150_1B are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_s1_step150_1B" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_s1_step150_1B" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "stage1-step150-tokens1B" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "stage1-step150-tokens1B" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: false + + 
conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step20000_84B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step20000_84B.yaml new file mode 100644 index 00000000..53e3fe26 --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step20000_84B.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders stage1-step20000-tokens84B and s1_step20000_84B are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_s1_step20000_84B" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_s1_step20000_84B" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "stage1-step20000-tokens84B" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "stage1-step20000-tokens84B" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: false + + 
conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step2000_9B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step2000_9B.yaml new file mode 100644 index 00000000..d736a6ae --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step2000_9B.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders stage1-step2000-tokens9B and s1_step2000_9B are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_s1_step2000_9B" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_s1_step2000_9B" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "stage1-step2000-tokens9B" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "stage1-step2000-tokens9B" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: false + + 
conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step300000_1259B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step300000_1259B.yaml new file mode 100644 index 00000000..52a6cc8e --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step300000_1259B.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders stage1-step300000-tokens1259B and s1_step300000_1259B are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_s1_step300000_1259B" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_s1_step300000_1259B" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "stage1-step300000-tokens1259B" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "stage1-step300000-tokens1259B" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: 
false + + conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step5000_21B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step5000_21B.yaml new file mode 100644 index 00000000..dd5a0d47 --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step5000_21B.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders stage1-step5000-tokens21B and s1_step5000_21B are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_s1_step5000_21B" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_s1_step5000_21B" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "stage1-step5000-tokens21B" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "stage1-step5000-tokens21B" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: false + + 
conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600000_2517B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600000_2517B.yaml new file mode 100644 index 00000000..b885cab1 --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600000_2517B.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders stage1-step600000-tokens2517B and s1_step600000_2517B are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_s1_step600000_2517B" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_s1_step600000_2517B" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "stage1-step600000-tokens2517B" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "stage1-step600000-tokens2517B" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: 
false + + conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600_3B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600_3B.yaml new file mode 100644 index 00000000..9b5bac79 --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600_3B.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders stage1-step600-tokens3B and s1_step600_3B are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_s1_step600_3B" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_s1_step600_3B" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "stage1-step600-tokens3B" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "stage1-step600-tokens3B" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: false + + 
conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step900000_3775B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step900000_3775B.yaml new file mode 100644 index 00000000..a5dd8c72 --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step900000_3775B.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders stage1-step900000-tokens3775B and s1_step900000_3775B are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_s1_step900000_3775B" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_s1_step900000_3775B" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "stage1-step900000-tokens3775B" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "stage1-step900000-tokens3775B" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: 
false + + conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step99000_416B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step99000_416B.yaml new file mode 100644 index 00000000..c5d6bc4d --- /dev/null +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step99000_416B.yaml @@ -0,0 +1,130 @@ +# ============================================================================ +# OLMo-2-7B CHECKPOINT MECHANISM PROBE (TEMPLATE) +# ============================================================================ +# Purpose: +# - Compute LP concentration for a *single* OLMo-2-1124-7B training checkpoint. +# - Identical mechanism-only setup to llama3_70b_scale_mechanism.yaml so the +# resulting JSONs are aggregable across model / checkpoint. +# - The placeholders stage1-step99000-tokens416B and s1_step99000_416B are substituted per-checkpoint +# by paper/slurm_jobs/run_olmo2_7b_checkpoint_trajectory.sh. 
+# ============================================================================ + +experiment: + name: "olmo2_7b_ckpt_s1_step99000_416B" + type: "llm_alignment" + output_dir: "./results/paper/olmo2_7b_ckpt_s1_step99000_416B" + seed: 42 + device: "cuda" + save_activations: false + num_networks: 1 + +model: + name: "hf_causal_lm" + model_id: "allenai/OLMo-2-1124-7B" + dtype: "bfloat16" + device_map: "auto" + tracked_layers: + - "model.model.layers.*.mlp.up_proj" + - "model.model.layers.*.mlp.gate_proj" + - "model.model.layers.*.mlp.down_proj" + +# Passed through to AutoModelForCausalLM.from_pretrained +model_kwargs: + revision: "stage1-step99000-tokens416B" + +# Passed through to AutoTokenizer.from_pretrained +tokenizer_kwargs: + revision: "stage1-step99000-tokens416B" + +dataset: + name: "wikitext" + batch_size: 1 + num_workers: 0 + +calibration: + dataset: "wikitext" + subset: "wikitext-2-raw-v1" + split: "train" + num_samples: 64 + max_length: 512 + batch_size: 1 + +metrics: + enabled: + - "rayleigh_quotient" + num_samples: 64 + rayleigh_quotient: + relative: true + regularization: 1.0e-6 + +do_scar_metrics: true +scar_num_samples: 64 +scar_max_length: 512 + +llm: + scar_metrics: true + scar_num_samples: 64 + scar_max_length: 512 + evaluate_perplexity: false + evaluation_metrics: [] + wikitext_subset: "wikitext-2-raw-v1" + +analysis: + generate_plots: false + save_scores: true + +generate_plots: false +save_scores: true + +do_connectivity_pruning: false +do_directed_redundancy: false +do_halo_analysis: false +do_generalized_importance: false + +supernode: + enabled: true + score_metric: "scar_loss_proxy" + core_fraction: 0.01 + follower_fraction: 0.10 + halo_fraction: 0.10 + connectivity_topk: 256 + connectivity_rank_normalize: false + connectivity_power: 1.0 + protect_core: true + protect_core_metrics: + - "scar_loss_proxy" + cross_layer_analysis: false + compare_by_connection: false + compute_metrics: + - "activation" + + read_halo_analysis: + enabled: false + + 
conditional_halo_ablation: + enabled: false + + lp_ablation_validation: + enabled: false + +supernode_robustness: + enabled: false + +supernode_summary: + enabled: true + outlier_analysis: true + +halo_analysis: + enabled: false + +cross_layer: + enabled: false + +generalized_importance: + enabled: false + +pruning: + enabled: false + +evaluation: + enabled: false From 80a60f0abea6726f53ca0292ce64dcd0a739f9bf Mon Sep 17 00:00:00 2001 From: Houman Safaai Date: Mon, 20 Apr 2026 09:21:18 -0400 Subject: [PATCH 5/7] update olmo configs --- configs/prune_llm/olmo2_7b_ckpt_template.yaml | 13 ++++++++----- configs/prune_llm/olmo2_7b_pruning_curves.yaml | 8 ++++---- .../olmo2_ckpts_generated/olmo2_7b_main.yaml | 13 ++++++++----- .../olmo2_7b_s1_step150_1B.yaml | 13 ++++++++----- .../olmo2_7b_s1_step20000_84B.yaml | 13 ++++++++----- .../olmo2_7b_s1_step2000_9B.yaml | 13 ++++++++----- .../olmo2_7b_s1_step300000_1259B.yaml | 13 ++++++++----- .../olmo2_7b_s1_step5000_21B.yaml | 13 ++++++++----- .../olmo2_7b_s1_step600000_2517B.yaml | 13 ++++++++----- .../olmo2_7b_s1_step600_3B.yaml | 13 ++++++++----- .../olmo2_7b_s1_step900000_3775B.yaml | 13 ++++++++----- .../olmo2_7b_s1_step99000_416B.yaml | 13 ++++++++----- 12 files changed, 92 insertions(+), 59 deletions(-) diff --git a/configs/prune_llm/olmo2_7b_ckpt_template.yaml b/configs/prune_llm/olmo2_7b_ckpt_template.yaml index f94eabda..51adca4a 100644 --- a/configs/prune_llm/olmo2_7b_ckpt_template.yaml +++ b/configs/prune_llm/olmo2_7b_ckpt_template.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. 
+ revision: "@REVISION@" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "@REVISION@" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. tokenizer_kwargs: revision: "@REVISION@" diff --git a/configs/prune_llm/olmo2_7b_pruning_curves.yaml b/configs/prune_llm/olmo2_7b_pruning_curves.yaml index 7217c741..01a5f290 100644 --- a/configs/prune_llm/olmo2_7b_pruning_curves.yaml +++ b/configs/prune_llm/olmo2_7b_pruning_curves.yaml @@ -24,15 +24,15 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # Final released weights (after stage2). revision goes here so the registry + # constructor (HFCausalLM) passes it to AutoModelForCausalLM.from_pretrained. + revision: "main" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Use final (main) checkpoint for pruning. -model_kwargs: - revision: "main" - tokenizer_kwargs: revision: "main" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_main.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_main.yaml index 7e678278..8b669c75 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_main.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_main.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. 
+ revision: "main" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "main" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. tokenizer_kwargs: revision: "main" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step150_1B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step150_1B.yaml index 26d6f837..79e054e6 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step150_1B.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step150_1B.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. + revision: "stage1-step150-tokens1B" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "stage1-step150-tokens1B" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. 
tokenizer_kwargs: revision: "stage1-step150-tokens1B" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step20000_84B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step20000_84B.yaml index 53e3fe26..e8b822a4 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step20000_84B.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step20000_84B.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. + revision: "stage1-step20000-tokens84B" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "stage1-step20000-tokens84B" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. 
tokenizer_kwargs: revision: "stage1-step20000-tokens84B" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step2000_9B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step2000_9B.yaml index d736a6ae..12f75c70 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step2000_9B.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step2000_9B.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. + revision: "stage1-step2000-tokens9B" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "stage1-step2000-tokens9B" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. 
tokenizer_kwargs: revision: "stage1-step2000-tokens9B" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step300000_1259B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step300000_1259B.yaml index 52a6cc8e..86464d4d 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step300000_1259B.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step300000_1259B.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. + revision: "stage1-step300000-tokens1259B" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "stage1-step300000-tokens1259B" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. 
tokenizer_kwargs: revision: "stage1-step300000-tokens1259B" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step5000_21B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step5000_21B.yaml index dd5a0d47..056d9e81 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step5000_21B.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step5000_21B.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. + revision: "stage1-step5000-tokens21B" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "stage1-step5000-tokens21B" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. 
tokenizer_kwargs: revision: "stage1-step5000-tokens21B" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600000_2517B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600000_2517B.yaml index b885cab1..a5477cd9 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600000_2517B.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600000_2517B.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. + revision: "stage1-step600000-tokens2517B" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "stage1-step600000-tokens2517B" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. 
tokenizer_kwargs: revision: "stage1-step600000-tokens2517B" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600_3B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600_3B.yaml index 9b5bac79..fb310a5c 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600_3B.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step600_3B.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. + revision: "stage1-step600-tokens3B" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "stage1-step600-tokens3B" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. 
tokenizer_kwargs: revision: "stage1-step600-tokens3B" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step900000_3775B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step900000_3775B.yaml index a5dd8c72..4024242f 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step900000_3775B.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step900000_3775B.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. + revision: "stage1-step900000-tokens3775B" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "stage1-step900000-tokens3775B" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. 
tokenizer_kwargs: revision: "stage1-step900000-tokens3775B" diff --git a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step99000_416B.yaml b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step99000_416B.yaml index c5d6bc4d..285e7b43 100644 --- a/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step99000_416B.yaml +++ b/configs/prune_llm/olmo2_ckpts_generated/olmo2_7b_s1_step99000_416B.yaml @@ -23,16 +23,19 @@ model: model_id: "allenai/OLMo-2-1124-7B" dtype: "bfloat16" device_map: "auto" + # NOTE: the registry constructor for hf_causal_lm (HFCausalLM in models/hub.py) + # reads `revision` directly from this block and passes it to + # AutoModelForCausalLM.from_pretrained, so checkpoint selection MUST go here + # rather than in a top-level model_kwargs: block. + revision: "stage1-step99000-tokens416B" + torch_dtype: "bfloat16" tracked_layers: - "model.model.layers.*.mlp.up_proj" - "model.model.layers.*.mlp.gate_proj" - "model.model.layers.*.mlp.down_proj" -# Passed through to AutoModelForCausalLM.from_pretrained -model_kwargs: - revision: "stage1-step99000-tokens416B" - -# Passed through to AutoTokenizer.from_pretrained +# Tokenizer does not change across OLMo-2 training checkpoints, but we pass +# the revision defensively so tokenizer and model remain paired. tokenizer_kwargs: revision: "stage1-step99000-tokens416B" From 6236387daf1e9b429984d1c8ef9bd9e4d505c804 Mon Sep 17 00:00:00 2001 From: Houman Safaai Date: Mon, 20 Apr 2026 09:28:45 -0400 Subject: [PATCH 6/7] Revert "add docs/papers.md multi-paper index" This reverts commit 76ed7d991c3e1bc86df853124618056160d8754d. --- docs/papers.md | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100644 docs/papers.md diff --git a/docs/papers.md b/docs/papers.md deleted file mode 100644 index 091dfd2a..00000000 --- a/docs/papers.md +++ /dev/null @@ -1,28 +0,0 @@ -# Papers built on this framework - -This framework underpins several research projects. 
Each paper has its own -companion repository with LaTeX source, paper-specific analysis scripts, and -reproducibility instructions. Paper repositories depend on a pinned version -of this framework; the recommended workflow is to clone both, checkout the -framework at the pinned ref, and follow the paper repo's `reproduce.md`. - -| Paper | Status | Paper repo | Framework ref | Year | -|-------|--------|------------|---------------|------| -| *Relevance Is Not Replaceability: Orthogonal Axes of Channel Information in Vision Networks* | manuscript companion repo | [KempnerInstitute/alignment_notes](https://github.com/KempnerInstitute/alignment_notes) | `ca438bd1419849775a08d366416486ba2c03ccdc` | 2026 | - -## Adding a new paper - -When starting a new paper that uses this framework: - -1. Create a dedicated git repo for the paper under the Kempner org (e.g. - `KempnerInstitute/`). -2. Commit the LaTeX source, figure-generation scripts, paper-specific analysis - scripts, and a `README.md`, `reproduce.md`, `CITATION.cff`, and - `pinned_commit.txt` in that repo. -3. The paper repo may live under `drafts//` inside this - framework repo's working tree — `drafts/` is gitignored here, so it won't - pollute the framework repo. Each paper repo should have its own `.gitignore`. -4. Before a public release, cut a tag in *this* framework repo (e.g. - `neurips2026--v1`) that matches the framework commit the paper - used, and record the full SHA in the paper repo's `pinned_commit.txt`. -5. Add a row to the table above. 
From f0c6ae5c749f04862add2774df19d8fbe0defbe5 Mon Sep 17 00:00:00 2001 From: Houman Safaai Date: Mon, 20 Apr 2026 09:29:59 -0400 Subject: [PATCH 7/7] update olmo configs --- src/alignment/configs/config_loader.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/alignment/configs/config_loader.py b/src/alignment/configs/config_loader.py index d5644e31..c7b00f6e 100644 --- a/src/alignment/configs/config_loader.py +++ b/src/alignment/configs/config_loader.py @@ -835,7 +835,18 @@ def _map_nested_to_flat_config(nested_config: Dict[str, Any]) -> Dict[str, Any]: flat_config["pretrained"] = external.get("pretrained", False) # Handle HuggingFace model config (for LLMs) - hf_fields = ["model_id", "model_backend", "dtype", "torch_dtype", "device_map"] + # `revision` and `trust_remote_code` forward through the hf_causal_lm + # registry constructor (HFCausalLM) into AutoModelForCausalLM.from_pretrained, + # so they must survive config flattening. + hf_fields = [ + "model_id", + "model_backend", + "dtype", + "torch_dtype", + "device_map", + "revision", + "trust_remote_code", + ] for field in hf_fields: if field in model: # Normalize dtype field name (prefer 'dtype', but accept 'torch_dtype')