Source code for causalml.inference.tree.causal.causalforest

from typing import Union

from contextlib import contextmanager
import numpy as np
import forestci as fci
from joblib import Parallel, delayed
from warnings import catch_warnings, simplefilter, warn

from sklearn.exceptions import DataConversionWarning
from sklearn.utils.validation import (
    check_random_state,
    _check_sample_weight,
    validate_data,
)
from sklearn.utils.multiclass import type_of_target
from sklearn import __version__ as sklearn_version
from sklearn.ensemble._forest import MAX_INT
from sklearn.ensemble._forest import ForestRegressor
from sklearn.ensemble._forest import compute_sample_weight, issparse
from sklearn.ensemble._forest import _generate_sample_indices, _get_n_samples_bootstrap

from .causaltree import CausalTreeRegressor
from ._tree import get_check_y_params

try:
    from packaging.version import parse as Version
except ModuleNotFoundError:
    from distutils.version import LooseVersion as Version

DOUBLE = np.float64
DTYPE = np.float32

# scikit-learn 1.9 refactored the forest sample_weight handling
# (https://github.com/scikit-learn/scikit-learn/pull/31529), making `sample_weight`
# a required positional argument on several private helpers vendored below.
# We pass `sample_weight=None` on >=1.9 to preserve the pre-1.9 uniform-bootstrap
# behavior (the vendored `_parallel_build_trees` already applies the user's
# `sample_weight` itself), keeping causal-forest results identical across versions.
_SKLEARN_GE_19 = Version(sklearn_version) >= Version("1.9.0")


@contextmanager
def _forestci_sklearn19_compat():
    """Make forestci's pre-1.9 sampler calls work under scikit-learn >= 1.9.

    forestci (<= 0.7) calls scikit-learn's private ``_generate_sample_indices``
    and ``_get_n_samples_bootstrap`` with their pre-1.9 arity (no
    ``sample_weight``), which raises ``TypeError`` on scikit-learn >= 1.9 (the
    same refactor handled elsewhere in this module). Until the upstream fix
    (scikit-learn-contrib/forest-confidence-interval#122) ships in a release and
    we can bump the ``forestci`` pin, temporarily wrap those names in forestci's
    module namespace to supply ``sample_weight=None`` -- the pre-1.9
    uniform-bootstrap default, so results are unchanged. This patches the exact
    globals ``forestci.calc_inbag`` resolves at call time, covering both the
    direct call and the recursive ``calibrate=True`` path. No-op on < 1.9.

    Note: this mutates ``forestci.forestci`` module globals for the duration of
    the call (restored in ``finally``); it is not safe under concurrent calls
    from multiple threads.
    """
    if not _SKLEARN_GE_19:
        yield
        return

    import forestci.forestci as _impl

    def _gsi(random_state, n_samples, n_samples_bootstrap, sample_weight=None):
        return _generate_sample_indices(
            random_state, n_samples, n_samples_bootstrap, sample_weight
        )

    def _gnsb(n_samples, max_samples, sample_weight=None):
        return _get_n_samples_bootstrap(n_samples, max_samples, sample_weight)

    orig = (_impl._generate_sample_indices, _impl._get_n_samples_bootstrap)
    _impl._generate_sample_indices, _impl._get_n_samples_bootstrap = _gsi, _gnsb
    try:
        yield
    finally:
        _impl._generate_sample_indices, _impl._get_n_samples_bootstrap = orig


if Version(sklearn_version) >= Version("1.1.0"):
    _joblib_parallel_args = dict(prefer="threads")
else:
    from sklearn.utils.fixes import _joblib_parallel_args

    _joblib_parallel_args = _joblib_parallel_args(prefer="threads")


def _parallel_build_trees(
    tree,
    forest,
    X,
    treatment,
    y,
    sample_weight,
    tree_idx,
    n_trees,
    verbose=0,
    class_weight=None,
    n_samples_bootstrap=None,
):
    """
    Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        if _SKLEARN_GE_19:
            # `sample_weight=None` -> uniform draw, matching pre-1.9 behavior
            indices = _generate_sample_indices(
                tree.random_state, n_samples, n_samples_bootstrap, sample_weight=None
            )
        else:
            indices = _generate_sample_indices(
                tree.random_state, n_samples, n_samples_bootstrap
            )
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == "subsample":
            with catch_warnings():
                simplefilter("ignore", DeprecationWarning)
                curr_sample_weight *= compute_sample_weight("auto", y, indices=indices)
        elif class_weight == "balanced_subsample":
            curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices)

        tree.fit(
            X,
            treatment,
            y,
            sample_weight=curr_sample_weight,
            check_input=True,
            prepare_data=False,
        )
    else:
        tree.fit(
            X,
            treatment,
            y,
            sample_weight=sample_weight,
            check_input=True,
            prepare_data=False,
        )

    return tree


[docs] class CausalRandomForestRegressor(ForestRegressor): def __init__( self, n_estimators: int = 100, *, control_name: Union[int, str] = 0, criterion: str = "causal_mse", alpha: float = 0.05, max_depth: int = None, min_samples_split: int = 60, min_samples_leaf: int = 100, min_group_samples: int = 50, min_weight_fraction_leaf: float = 0.0, max_features: Union[int, float, str] = 1.0, max_leaf_nodes: int = None, min_impurity_decrease: float = float("-inf"), bootstrap: bool = True, oob_score: bool = False, n_jobs: int = None, random_state: int = None, verbose: int = 0, warm_start: bool = False, ccp_alpha: float = 0.0, groups_penalty: float = 0.5, max_samples: int = None, groups_cnt: bool = True, groups_cnt_mode: str = "nodes", ): """ Initialize Random Forest of CausalTreeRegressors Args: n_estimators: (int, default=100) Number of trees in the forest control_name: (str or int) Name of control group criterion: ({"causal_mse", "standard_mse"}, default="causal_mse"): Function to measure the quality of a split. alpha: (float) The confidence level alpha of the ATE estimate and ITE bootstrap estimates max_depth: (int, default=None) The maximum depth of the tree. min_samples_split: (int or float, default=2) The minimum number of samples required to split an internal node: min_samples_leaf: (int or float), default=100 The minimum number of samples required to be at a leaf node. min_weight_fraction_leaf: (float, default=0.0) The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. max_features: (int, float or {"auto", "sqrt", "log2"}, default=None) The number of features to consider when looking for the best split max_leaf_nodes: (int, default=None) Grow a tree with ``max_leaf_nodes`` in best-first fashion. min_impurity_decrease: (float, default=float("-inf"))) A node will be split if this split induces a decrease of the impurity greater than or equal to this value. bootstrap : (bool, default=True) Whether bootstrap samples are used when building trees. oob_score : bool, default=False Whether to use out-of-bag samples to estimate the generalization score. n_jobs : int, default=None The number of jobs to run in parallel. random_state : (int, RandomState instance or None, default=None) Controls both the randomness of the bootstrapping of the samples used when building trees (if ``bootstrap=True``) and the sampling of the features to consider when looking for the best split at each node (if ``max_features < n_features``). verbose : (int, default=0) Controls the verbosity when fitting and predicting. warm_start : (bool, default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. ccp_alpha : (non-negative float, default=0.0) Complexity parameter used for Minimal Cost-Complexity Pruning. groups_penalty: (float, default=0.5) This penalty coefficient manages the node impurity increase in case of the difference between treatment and control samples sizes. max_samples : (int or float, default=None) If bootstrap is True, the number of samples to draw from X to train each base estimator. groups_cnt: (bool), count treatment and control groups for each node/leaf groups_cnt_mode: (str, 'nodes', 'leaves'), mode for samples counting """ self._estimator = CausalTreeRegressor( control_name=control_name, criterion=criterion, groups_cnt=groups_cnt, groups_cnt_mode=groups_cnt_mode, ) _estimator_key = ( "estimator" if Version(sklearn_version) >= Version("1.2.0") else "base_estimator" ) _parent_args = { _estimator_key: self._estimator, "n_estimators": n_estimators, "estimator_params": ( "criterion", "control_name", "max_depth", "min_samples_split", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", "min_impurity_decrease", "ccp_alpha", "groups_penalty", "min_samples_leaf", "min_group_samples", "random_state", ), "bootstrap": bootstrap, "oob_score": oob_score, "n_jobs": n_jobs, "random_state": random_state, "verbose": verbose, "warm_start": warm_start, "max_samples": max_samples, } super().__init__(**_parent_args) self.criterion = criterion self.control_name = control_name self.max_depth = max_depth self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_group_samples = min_group_samples self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.ccp_alpha = ccp_alpha self.groups_penalty = groups_penalty self.alpha = alpha self.groups_cnt = groups_cnt self.groups_cnt_mode = groups_cnt_mode def _fit( self, X: np.ndarray, treatment: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None, ): """ Build a forest of trees from the training set (X, y). With modified _parallel_build_trees for Causal Trees used in BaseForest.fit() Source: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/ensemble/_forest.py Parameters ---------- X (np.ndarray): {array-like, sparse matrix} of shape (n_samples, n_features) The training input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. treatment (np.ndarray): treatment vector, includes control group y (np.ndarray): array-like of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). sample_weight (np.ndarray): array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. Returns ------- self : object Fitted estimator. """ # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = get_check_y_params() X, y = validate_data( self, X, y, multi_output=True, accept_sparse="csc", validate_separately=(check_X_params, check_y_params), ) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. X.sort_indices() y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: warn( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, stacklevel=2, ) if y.ndim == 1: y = np.reshape(y, (-1, 1)) if self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is " "not allowed for Poisson regression." ) if np.sum(y) <= 0: raise ValueError( "Sum of y is not strictly positive which " "is necessary for Poisson regression." ) groups = np.unique(treatment).astype(int).size self.n_outputs_ = groups - 1 self.max_outputs_ = self.n_outputs_ + groups if _SKLEARN_GE_19: # 1.9 requires sample_weight; for a regressor this returns (y, None) # regardless, so threading it through is harmless. y, expanded_class_weight = self._validate_y_class_weight(y, sample_weight) else: y, expanded_class_weight = self._validate_y_class_weight(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight if not self.bootstrap and self.max_samples is not None: raise ValueError( "`max_sample` cannot be set if `bootstrap=False`. " "Either switch to `bootstrap=True` or set " "`max_sample=None`." ) elif self.bootstrap: if _SKLEARN_GE_19: n_samples_bootstrap = _get_n_samples_bootstrap( n_samples=X.shape[0], max_samples=self.max_samples, sample_weight=None, ) else: n_samples_bootstrap = _get_n_samples_bootstrap( n_samples=X.shape[0], max_samples=self.max_samples ) else: n_samples_bootstrap = None # Check parameters self._validate_estimator() if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available if bootstrap=True") random_state = check_random_state(self.random_state) if not self.warm_start or not hasattr(self, "estimators_"): # Free allocated memory, if any self.estimators_ = [] n_more_estimators = self.n_estimators - len(self.estimators_) if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " "len(estimators_)=%d when warm_start==True" % (self.n_estimators, len(self.estimators_)) ) elif n_more_estimators == 0: warn( "Warm-start fitting without increasing n_estimators does not " "fit new trees." ) else: if self.warm_start and len(self.estimators_) > 0: # We draw from the random state to get the random state we # would have got if we hadn't used a warm_start. random_state.randint(MAX_INT, size=len(self.estimators_)) trees = [ self._make_estimator(append=False, random_state=random_state) for _ in range(n_more_estimators) ] trees = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args, )( delayed(_parallel_build_trees)( tree=t, forest=self, X=X, treatment=treatment, y=y, sample_weight=sample_weight, tree_idx=i, n_trees=len(trees), verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, ) for i, t in enumerate(trees) ) self.estimators_.extend(trees) if self.oob_score: y_type = type_of_target(y) if y_type in ("multiclass-multioutput", "unknown"): raise ValueError( "The type of target cannot be used to compute OOB " f"estimates. Got {y_type} while only the following are " "supported: continuous, continuous-multioutput, binary, " "multiclass, multilabel-indicator." ) self._set_oob_score_and_attributes(X, y) if hasattr(self, "classes_") and self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self
[docs] def fit( self, X: np.ndarray, treatment: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None, ): """ Fit Causal RandomForest Args: X: (np.ndarray), feature matrix treatment: (np.ndarray), treatment vector y: (np.ndarray), outcome vector sample_weight: (np.ndarray), sample weights Returns: self """ X, y = self._estimator._prepare_data(X=X, treatment=treatment, y=y) return self._fit(X=X, treatment=treatment, y=y, sample_weight=sample_weight)
[docs] def predict(self, X: np.ndarray, with_outcomes: bool = False) -> np.ndarray: """Predict individual treatment effects Args: X (np.ndarray): a feature matrix with_outcomes (bool), default=False, include outcomes Y_hat(X|T=0), Y_hat(X|T=1) along with individual treatment effect Returns: (np.ndarray): individual treatment effect (ITE), dim=(samples, groups-1) or ITE with outcomes: [Y_hat(X|T=0), Y_hat(X|T=1),...,Y_hat(X|T=n), ITE_1, ITE_2,...,ITE_n], dim=(samples, 2*groups-1) """ if with_outcomes: self.n_outputs_ = self.max_outputs_ for estimator in self.estimators_: estimator._with_outcomes = True y_pred = super().predict(X) return y_pred
[docs] def calculate_error( self, X_train: np.ndarray, X_test: np.ndarray, inbag: np.ndarray = None, calibrate: bool = True, memory_constrained: bool = False, memory_limit: int = None, ) -> np.ndarray: """ Calculate error bars from scikit-learn RandomForest estimators Source: https://github.com/scikit-learn-contrib/forest-confidence-interval Args: X_train: (np.ndarray), training subsample of feature matrix, (n_train_sample, n_features) X_test: (np.ndarray), test subsample of feature matrix, (n_train_sample, n_features) inbag: (ndarray, optional), The inbag matrix that fit the data. If set to `None` (default) it will be inferred from the forest. However, this only works for trees for which bootstrapping was set to `True`. That is, if sampling was done with replacement. Otherwise, users need to provide their own inbag matrix. calibrate: (boolean, optional) Whether to apply calibration to mitigate Monte Carlo noise. Some variance estimates may be negative due to Monte Carlo effects if the number of trees in the forest is too small. To use calibration, Default: True memory_constrained: (boolean, optional) Whether or not there is a restriction on memory. If False, it is assumed that a ndarray of shape (n_train_sample,n_test_sample) fits in main memory. Setting to True can actually provide a speedup if memory_limit is tuned to the optimal range. memory_limit: (int, optional) An upper bound for how much memory the intermediate matrices will take up in Megabytes. This must be provided if memory_constrained=True. Returns: (np.ndarray), An array with the unbiased sampling variance for a RandomForest object. Note: This method delegates to ``forestci`` (forest-confidence-interval), which calls scikit-learn's private ``_get_n_samples_bootstrap`` and ``_generate_sample_indices`` with their pre-1.9 signatures. Under scikit-learn >= 1.9 those helpers require an extra ``sample_weight`` argument, so ``forestci`` (<= 0.7) would raise ``TypeError`` here. As a temporary measure this call runs inside ``_forestci_sklearn19_compat()``, which patches those helpers to pass ``sample_weight=None`` (the pre-1.9 uniform-bootstrap default, so results are unchanged), pending the upstream fix (forest-confidence-interval#122) and a ``forestci`` pin bump. The shim is not thread-safe (see its docstring). See https://github.com/uber/causalml/issues/906. """ if self.n_outputs_ != 1: raise NotImplementedError( f"forestci supports n_outputs=1. n_outputs={self.n_outputs_}" ) with _forestci_sklearn19_compat(): var = fci.random_forest_error( self, X_train, X_test, inbag=inbag, calibrate=calibrate, memory_constrained=memory_constrained, memory_limit=memory_limit, ) return var