reduce the size of trees before saving them (#33)

aloctavodia · web-flow · commit a70fad415240 · 2022-03-21T21:10:14.000+02:00
diff --git a/pymc_experimental/bart/pgbart.py b/pymc_experimental/bart/pgbart.py
@@ -55,7 +55,14 @@ class PGBART(ArrayStepShared):
     generates_stats = True
     stats_dtypes = [{"variable_inclusion": object, "bart_trees": object}]
 
-    def __init__(self, vars=None, num_particles=40, max_stages=100, batch="auto", model=None):
+    def __init__(
+        self,
+        vars=None,
+        num_particles=40,
+        max_stages=100,
+        batch="auto",
+        model=None,
+    ):
         model = modelcontext(model)
         initial_values = model.compute_initial_point()
         if vars is None:
@@ -135,7 +142,7 @@ def astep(self, _):
             # at the end of the algorithm we return one of these particles as the new tree
             particles = self.init_particles(tree_id)
             # Compute the sum of trees without the old tree, that we are attempting to replace
-            self.sum_trees_noi = self.sum_trees - particles[0].tree.predict_output()
+            self.sum_trees_noi = self.sum_trees - particles[0].tree._predict()
             # Resample leaf values for particle 1 which is a copy of the old tree
             particles[1].sample_leafs(
                 self.sum_trees,
@@ -191,10 +198,11 @@ def astep(self, _):
             # Get the new tree and update
             new_particle = np.random.choice(particles, p=normalized_weights)
             new_tree = new_particle.tree
-            self.all_trees[tree_id] = new_tree
+
             new_particle.log_weight = new_particle.old_likelihood_logp - self.log_num_particles
             self.all_particles[tree_id] = new_particle
-            self.sum_trees = self.sum_trees_noi + new_tree.predict_output()
+            self.sum_trees = self.sum_trees_noi + new_tree._predict()
+            self.all_trees[tree_id] = new_tree.trim()
 
             if self.tune:
                 self.ssv = SampleSplittingVariable(self.alpha_vec)
@@ -239,7 +247,7 @@ def update_weight(self, particle, old=False):
         Since the prior is used as the proposal,the weights are updated additively as the ratio of
         the new and old log-likelihoods.
         """
-        new_likelihood = self.likelihood_logp(self.sum_trees_noi + particle.tree.predict_output())
+        new_likelihood = self.likelihood_logp(self.sum_trees_noi + particle.tree._predict())
         if old:
             particle.log_weight = new_likelihood
             particle.old_likelihood_logp = new_likelihood
diff --git a/pymc_experimental/bart/tree.py b/pymc_experimental/bart/tree.py
@@ -74,22 +74,28 @@ def delete_node(self, index):
             self.idx_leaf_nodes.remove(index)
         del self.tree_structure[index]
 
-    def predict_output(self, excluded=None):
+    def trim(self):
+        a_tree = self.copy()
+        del a_tree.num_observations
+        del a_tree.idx_leaf_nodes
+        for k, v in a_tree.tree_structure.items():
+            current_node = a_tree[k]
+            del current_node.depth
+            if isinstance(current_node, LeafNode):
+                del current_node.idx_data_points
+        return a_tree
+
+    def _predict(self):
         output = np.zeros(self.num_observations)
         for node_index in self.idx_leaf_nodes:
             leaf_node = self.get_node(node_index)
-            if excluded is None:
-                output[leaf_node.idx_data_points] = leaf_node.value
-            else:
-                parent_node = leaf_node.get_idx_parent_node()
-                if self.get_node(parent_node).idx_split_variable not in excluded:
-                    output[leaf_node.idx_data_points] = leaf_node.value
+            output[leaf_node.idx_data_points] = leaf_node.value
 
         return output.astype(aesara.config.floatX)
 
-    def predict_out_of_sample(self, X, excluded=None):
+    def predict(self, X, excluded=None):
         """
-        Predict output of tree for an unobserved point x.
+        Predict output of tree for an (un)observed point X.
 
         Parameters
         ----------
diff --git a/pymc_experimental/bart/utils.py b/pymc_experimental/bart/utils.py
@@ -10,20 +10,21 @@
 from scipy.stats import pearsonr
 
 
-def predict(idata, rng, X_new=None, size=None, excluded=None):
+def predict(idata, rng, X=None, size=None, excluded=None):
     """
     Generate samples from the BART-posterior.
 
     Parameters
     ----------
-    idata: InferenceData
+    idata : InferenceData
         InferenceData containing a collection of BART_trees in sample_stats group
     rng: NumPy random generator
-    X_new : array-like
-        A new covariate matrix. Use it to obtain out-of-sample predictions
-    size: int or tuple
+    X : array-like
+        A covariate matrix. Use the one used to fit BART for in-sample predictions, or a new one
+        for out-of-sample predictions.
+    size : int or tuple
         Number of samples.
-    excluded: list
+    excluded : list
         indexes of the variables to exclude when computing predictions
     """
     bart_trees = idata.sample_stats.bart_trees
@@ -39,16 +40,10 @@ def predict(idata, rng, X_new=None, size=None, excluded=None):
 
     idx = rng.randint(len(stacked_trees.trees), size=flatten_size)
 
-    if X_new is None:
-        pred = np.zeros((flatten_size, stacked_trees[0, 0].item().num_observations))
-        for ind, p in enumerate(pred):
-            for tree in stacked_trees.isel(trees=idx[ind]).values:
-                p += tree.predict_output(excluded=excluded)
-    else:
-        pred = np.zeros((flatten_size, X_new.shape[0]))
-        for ind, p in enumerate(pred):
-            for tree in stacked_trees.isel(trees=idx[ind]).values:
-                p += np.array([tree.predict_out_of_sample(x, excluded) for x in X_new])
+    pred = np.zeros((flatten_size, X.shape[0]))
+    for ind, p in enumerate(pred):
+        for tree in stacked_trees.isel(trees=idx[ind]).values:
+            p += np.array([tree.predict(x, excluded) for x in X])
     return pred.reshape((*size, -1))
 
 
@@ -210,13 +205,13 @@ def plot_dependence(
             for x_i in new_X_i:
                 new_X[:, indices_mi] = X[:, indices_mi]
                 new_X[:, i] = x_i
-                y_pred.append(np.mean(predict(idata, rng, X_new=new_X, size=samples), 1))
+                y_pred.append(np.mean(predict(idata, rng, X=new_X, size=samples), 1))
             new_X_target.append(new_X_i)
         else:
             for instance in instances:
                 new_X = X[idx_s]
                 new_X[:, indices_mi] = X[:, indices_mi][instance]
-                y_pred.append(np.mean(predict(idata, rng, X_new=new_X, size=samples), 0))
+                y_pred.append(np.mean(predict(idata, rng, X=new_X, size=samples), 0))
             new_X_target.append(new_X[:, i])
         y_mins.append(np.min(y_pred))
         new_Y.append(np.array(y_pred).T)
@@ -302,16 +297,21 @@ def plot_dependence(
     return axes
 
 
-def plot_variable_importance(idata, labels=None, figsize=None, samples=100, random_seed=None):
+def plot_variable_importance(
+    idata, X=None, labels=None, figsize=None, samples=100, random_seed=None
+):
     """
     Estimates variable importance from the BART-posterior.
 
     Parameters
     ----------
     idata: InferenceData
         InferenceData containing a collection of BART_trees in sample_stats group
+    X : array-like
+        The covariate matrix.
     labels: list
-        List of the names of the covariates.
+        List of the names of the covariates. If X is a DataFrame, the names of the covariates will
+        be taken from it and this argument will be ignored.
     figsize : tuple
         Figure size. If None it will be defined automatically.
     samples : int
@@ -326,6 +326,10 @@ def plot_variable_importance(idata, labels=None, figsize=None, samples=100, rand
     rng = RandomState(seed=random_seed)
     _, axes = plt.subplots(2, 1, figsize=figsize)
 
+    if hasattr(X, "columns") and hasattr(X, "values"):
+        labels = list(X.columns)
+        X = X.values
+
     VI = idata.sample_stats["variable_inclusion"].mean(("chain", "draw")).values
     if labels is None:
         labels = range(len(VI))
@@ -341,12 +345,12 @@ def plot_variable_importance(idata, labels=None, figsize=None, samples=100, rand
     axes[0].set_xlabel("variable index")
     axes[0].set_ylabel("relative importance")
 
-    predicted_all = predict(idata, rng, size=samples, excluded=None)
+    predicted_all = predict(idata, rng, X=X, size=samples, excluded=None)
 
     EV_mean = np.zeros(len(VI))
     EV_hdi = np.zeros((len(VI), 2))
     for idx, subset in enumerate(subsets):
-        predicted_subset = predict(idata, rng, size=samples, excluded=subset)
+        predicted_subset = predict(idata, rng, X=X, size=samples, excluded=subset)
         pearson = np.zeros(samples)
         for j in range(samples):
             pearson[j] = pearsonr(predicted_all[j], predicted_subset[j])[0]
diff --git a/pymc_experimental/tests/test_bart.py b/pymc_experimental/tests/test_bart.py
@@ -77,9 +77,9 @@ class TestUtils:
 
     def test_predict(self):
         rng = RandomState(12345)
-        pred_all = pmx.bart.utils.predict(self.idata, rng, size=2)
+        pred_all = pmx.bart.utils.predict(self.idata, rng, X=self.X, size=2)
         rng = RandomState(12345)
-        pred_first = pmx.bart.utils.predict(self.idata, rng, X_new=self.X[:10])
+        pred_first = pmx.bart.utils.predict(self.idata, rng, X=self.X[:10])
 
         assert_almost_equal(pred_first, pred_all[0, :10], decimal=4)
         assert pred_all.shape == (2, 50)
@@ -112,7 +112,7 @@ def test_pdp(self, kwargs):
         ],
     )
     def test_vi(self, kwargs):
-        pmx.bart.utils.plot_variable_importance(self.idata, **kwargs)
+        pmx.bart.utils.plot_variable_importance(self.idata, X=self.X, **kwargs)
 
     def test_pdp_pandas_labels(self):
         pd = pytest.importorskip("pandas")