
Commit 8f51e2b

Updated python prediction and interval methods
1 parent 169e3fa commit 8f51e2b


7 files changed (+461 / -20 lines changed)


R/posterior_transformation.R

Lines changed: 2 additions & 6 deletions
@@ -361,7 +361,7 @@ posterior_predictive_heuristic_multiplier <- function(
 #' @param scale (Optional) Scale of mean function predictions. Options are "linear", which returns predictions on the original scale of the mean forest / RFX terms, and "probability", which transforms predictions into a probability of observing `y == 1`. "probability" is only valid for models fit with a probit outcome model. Default: "linear".
 #' @param covariates (Optional) A matrix or data frame of covariates at which to compute the intervals. Required if the requested term depends on covariates (e.g., prognostic forest, CATE forest, variance forest, or overall predictions).
 #' @param treatment (Optional) A vector or matrix of treatment assignments. Required if the requested term is `"y_hat"` (overall predictions).
-#' @param propensity (Optional) A vector or matrix of propensity scores. Required if the requested term is `"y_hat"` (overall predictions) and the underlying model depends on user-provided propensities.
+#' @param propensity (Optional) A vector or matrix of propensity scores. Required if the underlying model depends on user-provided propensities.
 #' @param rfx_group_ids An optional vector of group IDs for random effects. Required if the requested term includes random effects.
 #' @param rfx_basis An optional matrix of basis function evaluations for random effects. Required if the requested term includes random effects.
 #'
@@ -417,7 +417,6 @@ compute_bcf_posterior_interval <- function(
       "scale cannot be 'probability' for models not fit with a probit outcome model"
     )
   }
-  probability_scale <- scale == "probability"

   # Check that all the necessary inputs were provided for interval computation
   needs_covariates_intermediate <- ((("y_hat" %in% terms) ||
@@ -547,9 +546,7 @@ compute_bcf_posterior_interval <- function(
   }
 }

-#' Compute posterior credible intervals for BART model terms
-#'
-#' This function computes posterior credible intervals for specified terms from a fitted BART model. It supports intervals for mean functions, variance functions, random effects, and overall predictions.
+#' Compute posterior credible intervals for specified terms from a fitted BART model. It supports intervals for mean functions, variance functions, random effects, and overall predictions.
 #' @param model_object A fitted BART or BCF model object of class `bartmodel`.
 #' @param terms A character string specifying the model term(s) for which to compute intervals. Options for BART models are `"mean_forest"`, `"variance_forest"`, `"rfx"`, or `"y_hat"`.
 #' @param level A numeric value between 0 and 1 specifying the credible interval level (default is 0.95 for a 95% credible interval).
@@ -604,7 +601,6 @@ compute_bart_posterior_interval <- function(
       "scale cannot be 'probability' for models not fit with a probit outcome model"
     )
   }
-  probability_scale <- scale == "probability"

   # Check that all the necessary inputs were provided for interval computation
   needs_covariates_intermediate <- ((("y_hat" %in% terms) ||
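
A note on the scale = "probability" option documented above: for a probit outcome model the mean-function draws live on the latent (linear) scale, and the probability scale applies the standard normal CDF to each draw. A minimal Python sketch of that mapping, illustrative only (the array of latent draws below is made up; this is not the package's internal code):

# Illustrative only: how a probit "probability" scale maps latent mean-function
# draws to P(y == 1). The latent_draws array is hypothetical.
import numpy as np
from scipy.stats import norm

latent_draws = np.array([[-0.3, 0.1, 0.8], [0.0, 0.5, 1.2]])  # made-up posterior draws of f(x)
prob_draws = norm.cdf(latent_draws)  # probability-scale draws, each in (0, 1)
print(prob_draws.round(3))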

demo/debug/bart_predict_debug.py

Lines changed: 16 additions & 2 deletions
@@ -8,7 +8,7 @@

 # Generate data
 rng = np.random.default_rng()
-n = 100
+n = 500
 p = 5
 X = rng.uniform(low=0.0, high=1.0, size=(n, p))
 f_X = np.where(
@@ -42,7 +42,7 @@
     X_test=X_test,
     num_gfr=10,
     num_burnin=0,
-    num_mcmc=10,
+    num_mcmc=1000,
 )

 # # Check several predict approaches
@@ -66,3 +66,17 @@
 plt.ylabel("Actual")
 plt.title("Y hat")
 plt.show()
+
+# Compute posterior interval
+intervals = bart_model.compute_posterior_interval(
+    terms = "all",
+    scale = "linear",
+    level = 0.95,
+    covariates = X_test
+)
+
+# Check coverage
+mean_coverage = np.mean(
+    (intervals["y_hat"]["lower"] <= f_X_test) & (f_X_test <= intervals["y_hat"]["upper"])
+)
+print(f"Coverage of 95% posterior interval for f(X): {mean_coverage:.3f}")
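
A possible continuation of the demo above, reusing the intervals["y_hat"]["lower"/"upper"] layout shown in the coverage check (it assumes the script's `intervals` dict is in scope, so it is a sketch rather than a standalone example); it summarizes interval widths alongside the coverage number:

# Continuation of the BART demo above (assumes `intervals` from that script).
# Sketch: summarize 95% interval widths alongside the coverage printed above.
import numpy as np
import matplotlib.pyplot as plt

lower = np.asarray(intervals["y_hat"]["lower"])
upper = np.asarray(intervals["y_hat"]["upper"])
widths = upper - lower
print(f"Average 95% interval width for f(X): {np.mean(widths):.3f}")

plt.hist(widths, bins=30, color="gray")
plt.xlabel("Interval width")
plt.title("95% posterior interval widths")
plt.show()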

demo/debug/bcf_predict_debug.py

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
+# Demo of updated predict method for BCF
+
+# Load library
+from stochtree import BCFModel
+import numpy as np
+from sklearn.model_selection import train_test_split
+from scipy.stats import norm
+import matplotlib.pyplot as plt
+
+# Generate data
+rng = np.random.default_rng()
+n = 1000
+p = 5
+X = rng.normal(loc=0.0, scale=1.0, size=(n, p))
+mu_X = X[:,0]
+tau_X = 0.25 * X[:,1]
+pi_X = norm.cdf(0.5 * X[:,1])
+Z = rng.binomial(n=1, p=pi_X, size=(n,))
+E_XZ = mu_X + tau_X * Z
+snr = 2.0
+noise_sd = np.std(E_XZ) / snr
+y = E_XZ + rng.normal(loc=0.0, scale=noise_sd, size=(n,))
+
+# Train-test split
+sample_inds = np.arange(n)
+test_set_pct = 0.2
+train_inds, test_inds = train_test_split(sample_inds, test_size=test_set_pct)
+X_train = X[train_inds, :]
+X_test = X[test_inds, :]
+Z_train = Z[train_inds]
+Z_test = Z[test_inds]
+pi_train = pi_X[train_inds]
+pi_test = pi_X[test_inds]
+tau_train = tau_X[train_inds]
+tau_test = tau_X[test_inds]
+mu_train = mu_X[train_inds]
+mu_test = mu_X[test_inds]
+y_train = y[train_inds]
+y_test = y[test_inds]
+E_XZ_train = E_XZ[train_inds]
+E_XZ_test = E_XZ[test_inds]
+
+# Fit simple BCF model
+bcf_model = BCFModel()
+bcf_model.sample(
+    X_train=X_train,
+    Z_train=Z_train,
+    pi_train=pi_train,
+    y_train=y_train,
+    num_gfr=10,
+    num_burnin=0,
+    num_mcmc=1000,
+)
+
+# Check several predict approaches
+bcf_preds = bcf_model.predict(X=X_test, Z=Z_test, propensity=pi_test)
+y_hat_posterior_test = bcf_model.predict(X=X_test, Z=Z_test, propensity=pi_test)['y_hat']
+y_hat_mean_test = bcf_model.predict(
+    X=X_test, Z=Z_test, propensity=pi_test,
+    type = "mean",
+    terms = ["y_hat"]
+)
+tau_hat_mean_test = bcf_model.predict(
+    X=X_test, Z=Z_test, propensity=pi_test,
+    type = "mean",
+    terms = ["cate"]
+)
+# Check that this raises a warning
+y_hat_test = bcf_model.predict(
+    X=X_test, Z=Z_test, propensity=pi_test,
+    type = "mean",
+    terms = ["rfx", "variance"]
+)
+
+# Plot predicted versus actual
+plt.scatter(y_hat_mean_test, y_test, color="black")
+plt.axline((0, 0), slope=1, color="red", linestyle=(0, (3,3)))
+plt.xlabel("Predicted")
+plt.ylabel("Actual")
+plt.title("Y hat")
+plt.show()
+
+# Plot predicted versus actual
+plt.clf()
+plt.scatter(tau_hat_mean_test, tau_test, color="black")
+plt.axline((0, 0), slope=1, color="red", linestyle=(0, (3,3)))
+plt.xlabel("Predicted")
+plt.ylabel("Actual")
+plt.title("CATE function")
+plt.show()
+
+# Compute posterior interval
+intervals = bcf_model.compute_posterior_interval(
+    terms = "all",
+    scale = "linear",
+    level = 0.95,
+    covariates = X_test,
+    treatment = Z_test,
+    propensity = pi_test
+)
+
+# Check coverage of E[Y | X, Z]
+mean_coverage = np.mean(
+    (intervals["y_hat"]["lower"] <= E_XZ_test) & (E_XZ_test <= intervals["y_hat"]["upper"])
+)
+print(f"Coverage of 95% posterior interval for E[Y|X,Z]: {mean_coverage:.3f}")
+
+# Check coverage of tau(X)
+tau_coverage = np.mean(
+    (intervals["tau_hat"]["lower"] <= tau_test) & (tau_test <= intervals["tau_hat"]["upper"])
+)
+print(f"Coverage of 95% posterior interval for tau(X): {tau_coverage:.3f}")
+
+# Check coverage of mu(X)
+mu_coverage = np.mean(
+    (intervals["mu_hat"]["lower"] <= mu_test) & (mu_test <= intervals["mu_hat"]["upper"])
+)
+print(f"Coverage of 95% posterior interval for mu(X): {mu_coverage:.3f}")
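
A possible continuation of the BCF demo above (it assumes `bcf_model`, `X_test`, `Z_test`, `pi_test`, and `tau_test` from that script; the key names "tau_hat"/"lower"/"upper" are taken from the coverage checks). It repeats the interval computation at several nominal levels and compares nominal to empirical coverage for tau(X):

# Continuation of the BCF demo above; a sketch, not a standalone script.
# Compare nominal vs. empirical coverage of tau(X) intervals at several levels.
import numpy as np

for level in [0.5, 0.8, 0.9, 0.95]:
    ivl = bcf_model.compute_posterior_interval(
        terms="all", scale="linear", level=level,
        covariates=X_test, treatment=Z_test, propensity=pi_test,
    )
    cover = np.mean(
        (ivl["tau_hat"]["lower"] <= tau_test) & (tau_test <= ivl["tau_hat"]["upper"])
    )
    print(f"Nominal {level:.2f} vs. empirical {cover:.3f} coverage for tau(X)")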

stochtree/bart.py

Lines changed: 137 additions & 4 deletions
@@ -1,7 +1,3 @@
-"""
-Bayesian Additive Regression Trees (BART) module
-"""
-
 import warnings
 from math import log
 from numbers import Integral
@@ -28,6 +24,8 @@
     _expand_dims_1d,
     _expand_dims_2d,
     _expand_dims_2d_diag,
+    _posterior_predictive_heuristic_multiplier,
+    _summarize_interval
 )


@@ -1860,6 +1858,114 @@ def predict(
             result["variance_forest_predictions"] = None
         return result

+    def compute_posterior_interval(self, terms: Union[list[str], str] = "all", scale: str = "linear", level: float = 0.95, covariates: np.array = None, basis: np.array = None, rfx_group_ids: np.array = None, rfx_basis: np.array = None) -> dict:
+        """
+        Compute posterior credible intervals for specified terms from a fitted BART model. It supports intervals for mean functions, variance functions, random effects, and overall predictions.
+
+        Parameters
+        ----------
+        terms : str, optional
+            Character string specifying the model term(s) for which to compute intervals. Options for BART models are `"mean_forest"`, `"variance_forest"`, `"rfx"`, `"y_hat"`, or `"all"`. Defaults to `"all"`.
+        scale : str, optional
+            Scale of mean function predictions. Options are "linear", which returns predictions on the original scale of the mean forest / RFX terms, and "probability", which transforms predictions into a probability of observing `y == 1`. "probability" is only valid for models fit with a probit outcome model. Defaults to `"linear"`.
+        level : float, optional
+            A numeric value between 0 and 1 specifying the credible interval level. Defaults to 0.95 for a 95% credible interval.
+        covariates : np.array, optional
+            Optional array or data frame of covariates at which to compute the intervals. Required if the requested term depends on covariates (e.g., mean forest, variance forest, or overall predictions).
+        basis : np.array, optional
+            Optional array of basis function evaluations for mean forest models with regression defined in the leaves. Required for "leaf regression" models.
+        rfx_group_ids : np.array, optional
+            Optional vector of group IDs for random effects. Required if the requested term includes random effects.
+        rfx_basis : np.array, optional
+            Optional matrix of basis function evaluations for random effects. Required if the requested term includes random effects.
+
+        Returns
+        -------
+        dict
+            A dict containing the lower and upper bounds of the credible interval for the specified term. If multiple terms are requested, a dict with intervals for each term is returned.
+        """
+        # Check the provided model object and requested term
+        self.is_sampled()
+        for term in terms:
+            self.has_term(term)
+
+        # Handle mean function scale
+        if not isinstance(scale, str):
+            raise ValueError("scale must be a string")
+        if scale not in ["linear", "probability"]:
+            raise ValueError("scale must either be 'linear' or 'probability'")
+        is_probit = self.probit_outcome_model
+        if (scale == "probability") and (not is_probit):
+            raise ValueError(
+                "scale cannot be 'probability' for models not fit with a probit outcome model"
+            )
+
+        # Check that all the necessary inputs were provided for interval computation
+        needs_covariates_intermediate = (("y_hat" in terms) or ("all" in terms)) and self.include_mean_forest
+        needs_covariates = ("mean_forest" in terms) or ("variance_forest" in terms) or needs_covariates_intermediate
+        if needs_covariates:
+            if covariates is None:
+                raise ValueError(
+                    "'covariates' must be provided in order to compute the requested intervals"
+                )
+            if not isinstance(covariates, np.ndarray) and not isinstance(
+                covariates, pd.DataFrame
+            ):
+                raise ValueError("'covariates' must be a matrix or data frame")
+        needs_basis = needs_covariates and self.has_basis
+        if needs_basis:
+            if basis is None:
+                raise ValueError(
+                    "'basis' must be provided in order to compute the requested intervals"
+                )
+            if not isinstance(basis, np.ndarray):
+                raise ValueError("'basis' must be a numpy array")
+            if basis.shape[0] != covariates.shape[0]:
+                raise ValueError(
+                    "'basis' must have the same number of rows as 'covariates'"
+                )
+        needs_rfx_data_intermediate = (("y_hat" in terms) or ("all" in terms)) and self.has_rfx
+        needs_rfx_data = ("rfx" in terms) or needs_rfx_data_intermediate
+        if needs_rfx_data:
+            if rfx_group_ids is None:
+                raise ValueError(
+                    "'rfx_group_ids' must be provided in order to compute the requested intervals"
+                )
+            if not isinstance(rfx_group_ids, np.ndarray):
+                raise ValueError("'rfx_group_ids' must be a numpy array")
+            if rfx_group_ids.shape[0] != covariates.shape[0]:
+                raise ValueError(
+                    "'rfx_group_ids' must have the same length as the number of rows in 'covariates'"
+                )
+            if rfx_basis is None:
+                raise ValueError(
+                    "'rfx_basis' must be provided in order to compute the requested intervals"
+                )
+            if not isinstance(rfx_basis, np.ndarray):
+                raise ValueError("'rfx_basis' must be a numpy array")
+            if rfx_basis.shape[0] != covariates.shape[0]:
+                raise ValueError(
+                    "'rfx_basis' must have the same number of rows as 'covariates'"
+                )
+
+        # Compute posterior matrices for the requested model terms
+        predictions = self.predict(covariates=covariates, basis=basis, rfx_group_ids=rfx_group_ids, rfx_basis=rfx_basis, type="posterior", terms=terms, scale=scale)
+        has_multiple_terms = True if isinstance(predictions, dict) else False
+
+        # Compute posterior intervals
+        if has_multiple_terms:
+            result = dict()
+            for term in predictions.keys():
+                if predictions[term] is not None:
+                    result[term] = _summarize_interval(
+                        predictions[term], 1, level=level
+                    )
+            return result
+        else:
+            return _summarize_interval(
+                predictions, 1, level=level
+            )
+
     def to_json(self) -> str:
         """
         Converts a sampled BART model to JSON string representation (which can then be saved to a file or
@@ -2145,3 +2251,30 @@ def is_sampled(self) -> bool:
             `True` if a BART model has been sampled, `False` otherwise
         """
         return self.sampled
+
+    def has_term(self, term: str) -> bool:
+        """
+        Whether or not a model includes a term.
+
+        Parameters
+        ----------
+        term : str
+            Character string specifying the model term to check for. Options for BART models are `"mean_forest"`, `"variance_forest"`, `"rfx"`, `"y_hat"`, or `"all"`.
+
+        Returns
+        -------
+        bool
+            `True` if the model includes the specified term, `False` otherwise
+        """
+        if term == "mean_forest":
+            return self.include_mean_forest
+        elif term == "variance_forest":
+            return self.include_variance_forest
+        elif term == "rfx":
+            return self.has_rfx
+        elif term == "y_hat":
+            return self.include_mean_forest or self.has_rfx
+        elif term == "all":
+            return True
+        else:
+            return False
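
The new method delegates interval construction to _summarize_interval(draws, 1, level=level), whose implementation is not part of this diff. Below is a minimal stand-in sketch of an equal-tailed quantile summary along a draw axis, returning the {"lower": ..., "upper": ...} layout used by the demos above; this is an illustrative assumption, not the library's actual helper:

# Illustrative stand-in for _summarize_interval (not shown in this diff).
# Equal-tailed credible interval taken along `axis` of a matrix of posterior draws.
import numpy as np

def summarize_interval_sketch(draws: np.ndarray, axis: int = 1, level: float = 0.95) -> dict:
    alpha = 1.0 - level
    lower = np.quantile(draws, alpha / 2.0, axis=axis)  # lower quantile per observation
    upper = np.quantile(draws, 1.0 - alpha / 2.0, axis=axis)  # upper quantile per observation
    return {"lower": lower, "upper": upper}

# Toy check: rows are observations, columns are posterior draws
rng = np.random.default_rng(0)
toy_draws = rng.normal(size=(5, 1000))
print(summarize_interval_sketch(toy_draws, axis=1, level=0.95))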
