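Fixed bugs at the interface of probit / adaptive coding / random effects (RFX) in Python: RFX predictions are now included in the latent probit and adaptive coding updates, and probability-scale predictions use norm.cdf instead of norm.ppf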

andrewherren · andrewherren · commit 6ee1a9b8df25 · 2025-10-22T14:46:23.000-05:00
diff --git a/stochtree/bart.py b/stochtree/bart.py
@@ -1233,9 +1233,12 @@ def sample(
                 if self.include_mean_forest:
                     if self.probit_outcome_model:
                         # Sample latent probit variable z | -
-                        forest_pred = active_forest_mean.predict(forest_dataset_train)
-                        mu0 = forest_pred[y_train[:, 0] == 0]
-                        mu1 = forest_pred[y_train[:, 0] == 1]
+                        outcome_pred = active_forest_mean.predict(forest_dataset_train)
+                        if self.has_rfx:
+                            rfx_pred = rfx_model.predict(rfx_dataset_train, rfx_tracker)
+                            outcome_pred = outcome_pred + rfx_pred
+                        mu0 = outcome_pred[y_train[:, 0] == 0]
+                        mu1 = outcome_pred[y_train[:, 0] == 1]
                         n0 = np.sum(y_train[:, 0] == 0)
                         n1 = np.sum(y_train[:, 0] == 1)
                         u0 = self.rng.uniform(
@@ -1252,7 +1255,7 @@ def sample(
                         resid_train[y_train[:, 0] == 1, 0] = mu1 + norm.ppf(u1)
 
                         # Update outcome
-                        new_outcome = np.squeeze(resid_train) - forest_pred
+                        new_outcome = np.squeeze(resid_train) - outcome_pred
                         residual_train.update_data(new_outcome)
 
                     # Sample the mean forest
@@ -1437,11 +1440,14 @@ def sample(
                     if self.include_mean_forest:
                         if self.probit_outcome_model:
                             # Sample latent probit variable z | -
-                            forest_pred = active_forest_mean.predict(
+                            outcome_pred = active_forest_mean.predict(
                                 forest_dataset_train
                             )
-                            mu0 = forest_pred[y_train[:, 0] == 0]
-                            mu1 = forest_pred[y_train[:, 0] == 1]
+                            if self.has_rfx:
+                                rfx_pred = rfx_model.predict(rfx_dataset_train, rfx_tracker)
+                                outcome_pred = outcome_pred + rfx_pred
+                            mu0 = outcome_pred[y_train[:, 0] == 0]
+                            mu1 = outcome_pred[y_train[:, 0] == 1]
                             n0 = np.sum(y_train[:, 0] == 0)
                             n1 = np.sum(y_train[:, 0] == 1)
                             u0 = self.rng.uniform(
@@ -1458,7 +1464,7 @@ def sample(
                             resid_train[y_train[:, 0] == 1, 0] = mu1 + norm.ppf(u1)
 
                             # Update outcome
-                            new_outcome = np.squeeze(resid_train) - forest_pred
+                            new_outcome = np.squeeze(resid_train) - outcome_pred
                             residual_train.update_data(new_outcome)
 
                         # Sample the mean forest
@@ -1813,15 +1819,15 @@ def predict(
         # Combine into y hat predictions
         if probability_scale:
             if predict_y_hat and has_mean_forest and has_rfx:
-                y_hat = norm.ppf(mean_forest_predictions + rfx_predictions)
-                mean_forest_predictions = norm.ppf(mean_forest_predictions)
-                rfx_predictions = norm.ppf(rfx_predictions)
+                y_hat = norm.cdf(mean_forest_predictions + rfx_predictions)
+                mean_forest_predictions = norm.cdf(mean_forest_predictions)
+                rfx_predictions = norm.cdf(rfx_predictions)
             elif predict_y_hat and has_mean_forest:
-                y_hat = norm.ppf(mean_forest_predictions)
-                mean_forest_predictions = norm.ppf(mean_forest_predictions)
+                y_hat = norm.cdf(mean_forest_predictions)
+                mean_forest_predictions = norm.cdf(mean_forest_predictions)
             elif predict_y_hat and has_rfx:
-                y_hat = norm.ppf(rfx_predictions)
-                rfx_predictions = norm.ppf(rfx_predictions)
+                y_hat = norm.cdf(rfx_predictions)
+                rfx_predictions = norm.cdf(rfx_predictions)
         else:
             if predict_y_hat and has_mean_forest and has_rfx:
                 y_hat = mean_forest_predictions + rfx_predictions
@@ -2006,8 +2012,8 @@ def compute_contrast(
 
         # Transform to probability scale if requested
         if probability_scale:
-            treatment_preds = norm.ppf(treatment_preds)
-            control_preds = norm.ppf(control_preds)
+            treatment_preds = norm.cdf(treatment_preds)
+            control_preds = norm.cdf(control_preds)
 
         # Compute and return contrast
         if predict_mean:
diff --git a/stochtree/bcf.py b/stochtree/bcf.py
@@ -1735,9 +1735,12 @@ def sample(
                     # Sample latent probit variable z | -
                     forest_pred_mu = active_forest_mu.predict(forest_dataset_train)
                     forest_pred_tau = active_forest_tau.predict(forest_dataset_train)
-                    forest_pred = forest_pred_mu + forest_pred_tau
-                    mu0 = forest_pred[y_train[:, 0] == 0]
-                    mu1 = forest_pred[y_train[:, 0] == 1]
+                    outcome_pred = forest_pred_mu + forest_pred_tau
+                    if self.has_rfx:
+                        rfx_pred = rfx_model.predict(rfx_dataset_train, rfx_tracker)
+                        outcome_pred = outcome_pred + rfx_pred
+                    mu0 = outcome_pred[y_train[:, 0] == 0]
+                    mu1 = outcome_pred[y_train[:, 0] == 1]
                     n0 = np.sum(y_train[:, 0] == 0)
                     n1 = np.sum(y_train[:, 0] == 1)
                     u0 = self.rng.uniform(
@@ -1754,7 +1757,7 @@ def sample(
                     resid_train[y_train[:, 0] == 1, 0] = mu1 + norm.ppf(u1)
 
                     # Update outcome
-                    new_outcome = np.squeeze(resid_train) - forest_pred
+                    new_outcome = np.squeeze(resid_train) - outcome_pred
                     residual_train.update_data(new_outcome)
 
                 # Sample the prognostic forest
@@ -1817,18 +1820,21 @@ def sample(
 
                 # Sample coding parameters (if requested)
                 if self.adaptive_coding:
-                    mu_x = active_forest_mu.predict_raw(forest_dataset_train)
+                    partial_outcome_pred = active_forest_mu.predict_raw(forest_dataset_train)
                     tau_x = np.squeeze(
                         active_forest_tau.predict_raw(forest_dataset_train)
                     )
+                    if self.has_rfx:
+                        rfx_pred = rfx_model.predict(rfx_dataset_train, rfx_tracker)
+                        partial_outcome_pred = partial_outcome_pred + rfx_pred
                     s_tt0 = np.sum(tau_x * tau_x * (np.squeeze(Z_train) == 0))
                     s_tt1 = np.sum(tau_x * tau_x * (np.squeeze(Z_train) == 1))
-                    partial_resid_mu = np.squeeze(resid_train - mu_x)
+                    partial_resid = np.squeeze(resid_train - partial_outcome_pred)
                     s_ty0 = np.sum(
-                        tau_x * partial_resid_mu * (np.squeeze(Z_train) == 0)
+                        tau_x * partial_resid * (np.squeeze(Z_train) == 0)
                     )
                     s_ty1 = np.sum(
-                        tau_x * partial_resid_mu * (np.squeeze(Z_train) == 1)
+                        tau_x * partial_resid * (np.squeeze(Z_train) == 1)
                     )
                     current_b_0 = self.rng.normal(
                         loc=(s_ty0 / (s_tt0 + 2 * current_sigma2)),
@@ -1935,9 +1941,12 @@ def sample(
                     # Sample latent probit variable z | -
                     forest_pred_mu = active_forest_mu.predict(forest_dataset_train)
                     forest_pred_tau = active_forest_tau.predict(forest_dataset_train)
-                    forest_pred = forest_pred_mu + forest_pred_tau
-                    mu0 = forest_pred[y_train[:, 0] == 0]
-                    mu1 = forest_pred[y_train[:, 0] == 1]
+                    outcome_pred = forest_pred_mu + forest_pred_tau
+                    if self.has_rfx:
+                        rfx_pred = rfx_model.predict(rfx_dataset_train, rfx_tracker)
+                        outcome_pred = outcome_pred + rfx_pred
+                    mu0 = outcome_pred[y_train[:, 0] == 0]
+                    mu1 = outcome_pred[y_train[:, 0] == 1]
                     n0 = np.sum(y_train[:, 0] == 0)
                     n1 = np.sum(y_train[:, 0] == 1)
                     u0 = self.rng.uniform(
@@ -1954,7 +1963,7 @@ def sample(
                     resid_train[y_train[:, 0] == 1, 0] = mu1 + norm.ppf(u1)
 
                     # Update outcome
-                    new_outcome = np.squeeze(resid_train) - forest_pred
+                    new_outcome = np.squeeze(resid_train) - outcome_pred
                     residual_train.update_data(new_outcome)
 
                 # Sample the prognostic forest
@@ -2017,18 +2026,21 @@ def sample(
 
                 # Sample coding parameters (if requested)
                 if self.adaptive_coding:
-                    mu_x = active_forest_mu.predict_raw(forest_dataset_train)
+                    partial_outcome_pred = active_forest_mu.predict_raw(forest_dataset_train)
                     tau_x = np.squeeze(
                         active_forest_tau.predict_raw(forest_dataset_train)
                     )
+                    if self.has_rfx:
+                        rfx_pred = rfx_model.predict(rfx_dataset_train, rfx_tracker)
+                        partial_outcome_pred = partial_outcome_pred + rfx_pred
                     s_tt0 = np.sum(tau_x * tau_x * (np.squeeze(Z_train) == 0))
                     s_tt1 = np.sum(tau_x * tau_x * (np.squeeze(Z_train) == 1))
-                    partial_resid_mu = np.squeeze(resid_train - mu_x)
+                    partial_resid = np.squeeze(resid_train - partial_outcome_pred)
                     s_ty0 = np.sum(
-                        tau_x * partial_resid_mu * (np.squeeze(Z_train) == 0)
+                        tau_x * partial_resid * (np.squeeze(Z_train) == 0)
                     )
                     s_ty1 = np.sum(
-                        tau_x * partial_resid_mu * (np.squeeze(Z_train) == 1)
+                        tau_x * partial_resid * (np.squeeze(Z_train) == 1)
                     )
                     current_b_0 = self.rng.normal(
                         loc=(s_ty0 / (s_tt0 + 2 * current_sigma2)),
@@ -2655,8 +2667,8 @@ def compute_contrast(
 
         # Transform to probability scale if requested
         if probability_scale:
-            treatment_preds = norm.ppf(treatment_preds)
-            control_preds = norm.ppf(control_preds)
+            treatment_preds = norm.cdf(treatment_preds)
+            control_preds = norm.cdf(control_preds)
 
         # Compute and return contrast
         if predict_mean:
diff --git a/stochtree/random_effects.py b/stochtree/random_effects.py
@@ -426,6 +426,28 @@ def sample(
             rng.rng_cpp,
         )
 
+    def predict(
+        self, rfx_dataset: RandomEffectsDataset, rfx_tracker: RandomEffectsTracker
+    ) -> np.ndarray:
+        """
+        Predict random effects for each observation in `rfx_dataset`
+
+        Parameters
+        ----------
+        rfx_dataset: RandomEffectsDataset
+            Object of type `RandomEffectsDataset`
+        rfx_tracker: RandomEffectsTracker
+            Object of type `RandomEffectsTracker`
+
+        Returns
+        -------
+        np.ndarray
+            Numpy array of random effects predictions, one per observation in `rfx_dataset` (this method takes a model and tracker, not a container of samples)
+        """
+        return self.rfx_model_cpp.Predict(
+            rfx_dataset.rfx_dataset_cpp, rfx_tracker.rfx_tracker_cpp
+        )
+
     def set_working_parameter(self, working_parameter: np.ndarray) -> None:
         """
         Set values for the "working parameter." This is typically used for initialization,