Added posterior predictive sampling methods to BART and BCF in python

andrewherren · andrewherren · commit 4f1beed35aed · 2025-10-17T01:02:33.000-05:00
diff --git a/R/posterior_transformation.R b/R/posterior_transformation.R
@@ -1,14 +1,14 @@
 #' Sample from the posterior predictive distribution for outcomes modeled by BCF
 #'
 #' @param model_object A fitted BCF model object of class `bcfmodel`.
-#' @param covariates (Optional) A matrix or data frame of covariates at which to compute the intervals. Required if the requested term depends on covariates (e.g., prognostic forest, CATE forest, variance forest, or overall predictions).
-#' @param treatment (Optional) A vector or matrix of treatment assignments. Required if the requested term is `"y_hat"` (overall predictions).
-#' @param propensity (Optional) A vector or matrix of propensity scores. Required if the requested term is `"y_hat"` (overall predictions) and the underlying model depends on user-provided propensities.
+#' @param covariates A matrix or data frame of covariates.
+#' @param treatment A vector or matrix of treatment assignments.
+#' @param propensity (Optional) A vector or matrix of propensity scores. Required if the underlying model depends on user-provided propensities.
 #' @param rfx_group_ids (Optional) A vector of group IDs for random effects model. Required if the BCF model includes random effects.
 #' @param rfx_basis (Optional) A matrix of bases for random effects model. Required if the BCF model includes random effects.
-#' @param num_draws (Optional) The number of samples to draw from the likelihood, for each draw of the posterior, in computing intervals. Defaults to a heuristic based on the number of samples in a BCF model (i.e. if the BCF model has >1000 draws, we use 1 draw from the likelihood per sample, otherwise we upsample to ensure at least 1000 posterior predictive draws).
+#' @param num_draws_per_sample (Optional) The number of samples to draw from the likelihood for each draw of the posterior. Defaults to a heuristic based on the number of samples in a BCF model (i.e. if the BCF model has >1000 draws, we use 1 draw from the likelihood per sample, otherwise we upsample to ensure at least 1000 posterior predictive draws).
 #'
-#' @returns Array of posterior predictive samples with dimensions (num_observations, num_posterior_samples, num_draws) if num_draws > 1, otherwise (num_observations, num_posterior_samples).
+#' @returns Array of posterior predictive samples with dimensions (num_observations, num_posterior_samples, num_draws_per_sample) if num_draws_per_sample > 1, otherwise (num_observations, num_posterior_samples).
 #'
 #' @export
 #' @examples
@@ -30,9 +30,9 @@ sample_bcf_posterior_predictive <- function(
   propensity = NULL,
   rfx_group_ids = NULL,
   rfx_basis = NULL,
-  num_draws = NULL
+  num_draws_per_sample = NULL
 ) {
-  # Check the provided model object and requested term
+  # Check the provided model object
   check_model_is_valid(model_object)
 
   # Determine whether the outcome is continuous (Gaussian) or binary (probit-link)
@@ -123,7 +123,7 @@ sample_bcf_posterior_predictive <- function(
     }
   }
 
-  # Compute posterior predictive samples
+  # Compute posterior samples
   bcf_preds <- predict(
     model_object,
     X = covariates,
@@ -132,8 +132,11 @@ sample_bcf_posterior_predictive <- function(
     rfx_group_ids = rfx_group_ids,
     rfx_basis = rfx_basis,
     type = "posterior",
-    terms = c("all")
+    terms = c("all"),
+    scale = "linear"
   )
+
+  # Compute outcome mean and variance for every posterior draw
   has_rfx <- model_object$model_params$has_rfx
   has_variance_forest <- model_object$model_params$include_variance_forest
   samples_global_variance <- model_object$model_params$sample_sigma2_global
@@ -155,16 +158,20 @@ sample_bcf_posterior_predictive <- function(
       ppd_variance <- model_object$model_params$initial_sigma2
     }
   }
-  if (is.null(num_draws)) {
+
+  # Sample from the posterior predictive distribution
+  if (is.null(num_draws_per_sample)) {
     ppd_draw_multiplier <- posterior_predictive_heuristic_multiplier(
       num_posterior_draws,
       num_observations
     )
   } else {
-    ppd_draw_multiplier <- num_draws
+    ppd_draw_multiplier <- num_draws_per_sample
   }
   num_ppd_draws <- ppd_draw_multiplier * num_posterior_draws * num_observations
   ppd_vector <- rnorm(num_ppd_draws, ppd_mean, sqrt(ppd_variance))
+
+  # Reshape data
   if (ppd_draw_multiplier > 1) {
     ppd_array <- array(
       ppd_vector,
@@ -177,6 +184,7 @@ sample_bcf_posterior_predictive <- function(
     )
   }
 
+  # Binarize outcomes for probit models
   if (is_probit) {
     ppd_array <- (ppd_array > 0.0) * 1
   }
@@ -187,13 +195,13 @@ sample_bcf_posterior_predictive <- function(
 #' Sample from the posterior predictive distribution for outcomes modeled by BART
 #'
 #' @param model_object A fitted BART model object of class `bartmodel`.
-#' @param covariates A matrix or data frame of covariates at which to compute the intervals. Required if the BART model depends on covariates (e.g., contains a mean or variance forest).
+#' @param covariates A matrix or data frame of covariates. Required if the BART model depends on covariates (e.g., contains a mean or variance forest).
 #' @param basis A matrix of bases for mean forest models with regression defined in the leaves. Required for "leaf regression" models.
 #' @param rfx_group_ids A vector of group IDs for random effects model. Required if the BART model includes random effects.
 #' @param rfx_basis A matrix of bases for random effects model. Required if the BART model includes random effects.
-#' @param num_draws The number of posterior predictive samples to draw in computing intervals. Defaults to a heuristic based on the number of samples in a BART model (i.e. if the BART model has >1000 draws, we use 1 draw from the likelihood per sample, otherwise we upsample to ensure intervals are based on at least 1000 posterior predictive draws).
+#' @param num_draws_per_sample The number of posterior predictive samples to draw for each posterior sample. Defaults to a heuristic based on the number of samples in a BART model (i.e. if the BART model has >1000 draws, we use 1 draw from the likelihood per sample, otherwise we upsample to ensure at least 1000 posterior predictive draws).
 #'
-#' @returns Array of posterior predictive samples with dimensions (num_observations, num_posterior_samples, num_draws) if num_draws > 1, otherwise (num_observations, num_posterior_samples).
+#' @returns Array of posterior predictive samples with dimensions (num_observations, num_posterior_samples, num_draws_per_sample) if num_draws_per_sample > 1, otherwise (num_observations, num_posterior_samples).
 #'
 #' @export
 #' @examples
@@ -211,9 +219,9 @@ sample_bart_posterior_predictive <- function(
   basis = NULL,
   rfx_group_ids = NULL,
   rfx_basis = NULL,
-  num_draws = NULL
+  num_draws_per_sample = NULL
 ) {
-  # Check the provided model object and requested term
+  # Check the provided model object
   check_model_is_valid(model_object)
 
   # Determine whether the outcome is continuous (Gaussian) or binary (probit-link)
@@ -276,16 +284,19 @@ sample_bart_posterior_predictive <- function(
     }
   }
 
-  # Compute posterior predictive samples
+  # Compute posterior samples
   bart_preds <- predict(
     model_object,
     covariates = covariates,
     leaf_basis = basis,
     rfx_group_ids = rfx_group_ids,
     rfx_basis = rfx_basis,
     type = "posterior",
-    terms = c("all")
+    terms = c("all"),
+    scale = "linear"
   )
+
+  # Compute outcome mean and variance for every posterior draw
   has_mean_term <- (model_object$model_params$include_mean_forest ||
     model_object$model_params$has_rfx)
   has_variance_forest <- model_object$model_params$include_variance_forest
@@ -312,16 +323,20 @@ sample_bart_posterior_predictive <- function(
       ppd_variance <- model_object$model_params$sigma2_init
     }
   }
-  if (is.null(num_draws)) {
+
+  # Sample from the posterior predictive distribution
+  if (is.null(num_draws_per_sample)) {
     ppd_draw_multiplier <- posterior_predictive_heuristic_multiplier(
       num_posterior_draws,
       num_observations
     )
   } else {
-    ppd_draw_multiplier <- num_draws
+    ppd_draw_multiplier <- num_draws_per_sample
   }
   num_ppd_draws <- ppd_draw_multiplier * num_posterior_draws * num_observations
   ppd_vector <- rnorm(num_ppd_draws, ppd_mean, sqrt(ppd_variance))
+
+  # Reshape data
   if (ppd_draw_multiplier > 1) {
     ppd_array <- array(
       ppd_vector,
@@ -334,6 +349,7 @@ sample_bart_posterior_predictive <- function(
     )
   }
 
+  # Binarize outcomes for probit models
   if (is_probit) {
     ppd_array <- (ppd_array > 0.0) * 1
   }
diff --git a/demo/debug/bart_predict_debug.py b/demo/debug/bart_predict_debug.py
@@ -80,3 +80,25 @@
     (intervals["y_hat"]["lower"] <= f_X_test) & (f_X_test <= intervals["y_hat"]["upper"])
 )
 print(f"Coverage of 95% posterior interval for f(X): {mean_coverage:.3f}")
+
+# Sample from the posterior predictive distribution
+bart_ppd_samples = bart_model.sample_posterior_predictive(
+    covariates = X_test, num_draws_per_sample = 10
+)
+
+# Plot PPD mean vs actual
+ppd_mean = np.mean(bart_ppd_samples, axis=(0, 2))
+plt.clf()
+plt.scatter(ppd_mean, y_test, color="blue")
+plt.axline((0, 0), slope=1, color="red", linestyle=(0, (3,3)))
+plt.xlabel("Predicted")
+plt.ylabel("Actual")
+plt.title("Posterior Predictive Mean Comparison")
+plt.show()
+
+# Check coverage of posterior predictive distribution
+ppd_intervals = np.percentile(bart_ppd_samples, [2.5, 97.5], axis=(0, 2))
+ppd_coverage = np.mean(
+    (ppd_intervals[0, :] <= y_test) & (y_test <= ppd_intervals[1, :])
+)
+print(f"Coverage of 95% posterior predictive interval for Y: {ppd_coverage:.3f}")
diff --git a/demo/debug/bcf_predict_debug.py b/demo/debug/bcf_predict_debug.py
@@ -116,3 +116,25 @@
     (intervals["mu_hat"]["lower"] <= mu_test) & (mu_test <= intervals["mu_hat"]["upper"])
 )
 print(f"Coverage of 95% posterior interval for mu(X): {mu_coverage:.3f}")
+
+# Sample from the posterior predictive distribution
+bcf_ppd_samples = bcf_model.sample_posterior_predictive(
+    covariates = X_test, treatment = Z_test, propensity = pi_test, num_draws_per_sample = 10
+)
+
+# Plot PPD mean vs actual
+ppd_mean = np.mean(bcf_ppd_samples, axis=(0, 2))
+plt.clf()
+plt.scatter(ppd_mean, y_test, color="blue")
+plt.axline((0, 0), slope=1, color="red", linestyle=(0, (3,3)))
+plt.xlabel("Predicted")
+plt.ylabel("Actual")
+plt.title("Posterior Predictive Mean Comparison")
+plt.show()
+
+# Check coverage of posterior predictive distribution
+ppd_intervals = np.percentile(bcf_ppd_samples, [2.5, 97.5], axis=(0, 2))
+ppd_coverage = np.mean(
+    (ppd_intervals[0, :] <= y_test) & (y_test <= ppd_intervals[1, :])
+)
+print(f"Coverage of 95% posterior predictive interval for Y: {ppd_coverage:.3f}")
diff --git a/stochtree/bart.py b/stochtree/bart.py
@@ -1884,10 +1884,12 @@ def compute_posterior_interval(self, terms: Union[list[str], str] = "all", scale
         dict
             A dict containing the lower and upper bounds of the credible interval for the specified term. If multiple terms are requested, a dict with intervals for each term is returned.
         """
-        # Check the provided model object and requested term
-        self.is_sampled()
+        # Check the provided model object and requested terms
+        if not self.is_sampled():
+            raise ValueError("Model has not yet been sampled")
         for term in terms:
-            self.has_term(term)
+            if not self.has_term(term):
+                warnings.warn(f"Term {term} was not sampled in this model and its intervals will not be returned.")
 
         # Handle mean function scale
         if not isinstance(scale, str):
@@ -1966,6 +1968,134 @@ def compute_posterior_interval(self, terms: Union[list[str], str] = "all", scale
                     predictions, 1, level=level
                 )
     
+    def sample_posterior_predictive(self, covariates: np.array = None, basis: np.array = None, rfx_group_ids: np.array = None, rfx_basis: np.array = None, num_draws_per_sample: int = None) -> np.array:
+        """
+        Sample from the posterior predictive distribution for outcomes modeled by BART
+
+        Parameters
+        ----------
+        covariates : np.array, optional
+            An array or data frame of covariates at which to sample outcomes. Required if the BART model depends on covariates (e.g., contains a mean or variance forest).
+        basis : np.array, optional
+            An array of basis function evaluations for mean forest models with regression defined in the leaves. Required for "leaf regression" models.
+        rfx_group_ids : np.array, optional
+            An array of group IDs for random effects. Required if the BART model includes random effects.
+        rfx_basis : np.array, optional
+            An array of basis function evaluations for random effects. Required if the BART model includes random effects.
+        num_draws_per_sample : int, optional
+            The number of posterior predictive samples to draw for each posterior sample. Defaults to a heuristic based on the number of samples in a BART model (i.e. if the BART model has >1000 draws, we use 1 draw from the likelihood per sample, otherwise we upsample to ensure at least 1000 posterior predictive draws).
+        
+        Returns
+        -------
+        np.array
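+            Array of posterior predictive samples with shape (num_draws_per_sample, num_observations, num_posterior_samples) if more than one draw is taken per posterior sample, otherwise (num_observations, num_posterior_samples). Outcomes are binarized (0/1) for probit models.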
+        """
+        # Check the provided model object
+        if not self.is_sampled():
+            raise ValueError("Model has not yet been sampled")
+
+        # Determine whether the outcome is continuous (Gaussian) or binary (probit-link)
+        is_probit = self.probit_outcome_model
+
+        # Check that all the necessary inputs were provided for posterior predictive sampling
+        needs_covariates = self.include_mean_forest
+        if needs_covariates:
+            if covariates is None:
+                raise ValueError(
+                    "'covariates' must be provided in order to compute the requested intervals"
+                )
+            if not isinstance(covariates, np.ndarray) and not isinstance(
+                covariates, pd.DataFrame
+            ):
+                raise ValueError("'covariates' must be a matrix or data frame")
+        needs_basis = needs_covariates and self.has_basis
+        if needs_basis:
+            if basis is None:
+                raise ValueError(
+                    "'basis' must be provided in order to compute the requested intervals"
+                )
+            if not isinstance(basis, np.ndarray):
+                raise ValueError("'basis' must be a numpy array")
+            if basis.shape[0] != covariates.shape[0]:
+                raise ValueError(
+                    "'basis' must have the same number of rows as 'covariates'"
+                )
+        needs_rfx_data = self.has_rfx
+        if needs_rfx_data:
+            if rfx_group_ids is None:
+                raise ValueError(
+                    "'rfx_group_ids' must be provided in order to compute the requested intervals"
+                )
+            if not isinstance(rfx_group_ids, np.ndarray):
+                raise ValueError("'rfx_group_ids' must be a numpy array")
+            if rfx_group_ids.shape[0] != covariates.shape[0]:
+                raise ValueError(
+                    "'rfx_group_ids' must have the same length as the number of rows in 'covariates'"
+                )
+            if rfx_basis is None:
+                raise ValueError(
+                    "'rfx_basis' must be provided in order to compute the requested intervals"
+                )
+            if not isinstance(rfx_basis, np.ndarray):
+                raise ValueError("'rfx_basis' must be a numpy array")
+            if rfx_basis.shape[0] != covariates.shape[0]:
+                raise ValueError(
+                    "'rfx_basis' must have the same number of rows as 'covariates'"
+                )
+
+        # Compute posterior samples
+        bart_preds = self.predict(covariates=covariates, basis=basis, rfx_group_ids=rfx_group_ids, rfx_basis=rfx_basis, type="posterior", terms="all")
+
+        # Compute outcome mean and variance for posterior predictive distribution
+        has_mean_term = (self.include_mean_forest or self.has_rfx)
+        has_variance_forest = self.include_variance_forest
+        samples_global_variance = self.sample_sigma2_global
+        num_posterior_draws = self.num_samples
+        num_observations = covariates.shape[0]
+        if has_mean_term:
+            ppd_mean = bart_preds["y_hat"]
+        else:
+            ppd_mean = 0.
+        if has_variance_forest:
+            ppd_variance = bart_preds["variance_forest_predictions"]
+        else:
+            if samples_global_variance:
+                ppd_variance = np.tile(
+                    self.global_var_samples,
+                    (num_observations, 1)
+                )
+            else:
+                ppd_variance = self.sigma2_init
+        
+        # Sample from the posterior predictive distribution
+        if num_draws_per_sample is None:
+            ppd_draw_multiplier = _posterior_predictive_heuristic_multiplier(
+                num_posterior_draws,
+                num_observations
+            )
+        else:
+            ppd_draw_multiplier = num_draws_per_sample
+        if ppd_draw_multiplier > 1:
+            ppd_mean = np.tile(ppd_mean, (ppd_draw_multiplier, 1, 1))
+            ppd_variance = np.tile(ppd_variance, (ppd_draw_multiplier, 1, 1))
+            ppd_array = np.random.normal(
+                loc = ppd_mean,
+                scale = np.sqrt(ppd_variance), 
+                size = (ppd_draw_multiplier, num_observations, num_posterior_draws)
+            )
+        else:
+            ppd_array = np.random.normal(
+                loc = ppd_mean,
+                scale = np.sqrt(ppd_variance), 
+                size = (num_observations, num_posterior_draws)
+            )
+    
+        # Binarize outcome for probit models
+        if is_probit:
+            ppd_array = (ppd_array > 0.0) * 1
+        
+        return ppd_array
+    
     def to_json(self) -> str:
         """
         Converts a sampled BART model to JSON string representation (which can then be saved to a file or
diff --git a/stochtree/bcf.py b/stochtree/bcf.py