Source code for causalml.inference.tree.causal.causaltree

import logging
from typing import Union

import tqdm
import numpy as np
from numpy import float32 as DTYPE

from pathos.pools import ProcessPool as PPool
from scipy.stats import norm
from sklearn.base import RegressorMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from causalml.inference.meta.utils import check_treatment_vector

from ._tree import BaseCausalDecisionTree
from ..utils import get_tree_leaves_mask, timeit

logger = logging.getLogger("causalml")


class CausalTreeRegressor(RegressorMixin, BaseCausalDecisionTree):
    """A Causal Tree regressor class.
    The Causal Tree is a decision tree regressor with a split criterion for treatment effects.
    Details are available at `Athey and Imbens (2015) <https://arxiv.org/abs/1504.01132>`_.
    """

    def __init__(
        self,
        *,
        criterion: str = "causal_mse",
        splitter: str = "best",
        alpha: float = 0.05,
        control_name: Union[int, str] = 0,
        max_depth: int = None,
        min_samples_split: Union[int, float] = 60,
        min_weight_fraction_leaf: float = 0.0,
        max_features: Union[int, float, str] = None,
        max_leaf_nodes: int = None,
        min_impurity_decrease: float = float("-inf"),
        ccp_alpha: float = 0.0,
        groups_penalty: float = 0.5,
        min_samples_leaf: int = 100,
        random_state: int = None,
        groups_cnt: bool = False,
        groups_cnt_mode: str = "nodes",
    ):
        """
        Initialize a Causal Tree

        Args:
            criterion: ({"causal_mse", "standard_mse"}, default="causal_mse")
                The function to measure the quality of a split.
            splitter: ({"best", "random"}, default="best")
                The strategy used to choose the split at each node. Supported
                strategies are "best" to choose the best split and "random" to
                choose the best random split.
            alpha: (float)
                The significance level alpha used for the ATE estimate and the
                ITE bootstrap confidence intervals.
            control_name: (str or int)
                Name of the control group.
            max_depth: (int, default=None)
                The maximum depth of the tree. If None, then nodes are expanded
                until all leaves are pure or until all leaves contain less than
                min_samples_split samples.
            min_samples_split: (int or float, default=60)
                The minimum number of samples required to split an internal node:
                - If int, then consider `min_samples_split` as the minimum number.
                - If float, then `min_samples_split` is a fraction and
                  `ceil(min_samples_split * n_samples)` are the minimum
                  number of samples for each split.
            min_weight_fraction_leaf: (float, default=0.0)
                The minimum weighted fraction of the sum total of weights (of all
                the input samples) required to be at a leaf node. Samples have
                equal weight when sample_weight is not provided.
            max_features: (int, float or {"auto", "sqrt", "log2"}, default=None)
                The number of features to consider when looking for the best split:
                - If int, then consider `max_features` features at each split.
                - If float, then `max_features` is a fraction and
                  `int(max_features * n_features)` features are considered at
                  each split.
                - If "auto", then `max_features=n_features`.
                - If "sqrt", then `max_features=sqrt(n_features)`.
                - If "log2", then `max_features=log2(n_features)`.
                - If None, then `max_features=n_features`.
            max_leaf_nodes: (int, default=None)
                Grow a tree with ``max_leaf_nodes`` in best-first fashion.
                Best nodes are defined as relative reduction in impurity.
                If None then unlimited number of leaf nodes.
            min_impurity_decrease: (float, default=float("-inf"))
                A node will be split if this split induces a decrease of the
                impurity greater than or equal to this value.
            ccp_alpha: (non-negative float, default=0.0)
                Complexity parameter used for Minimal Cost-Complexity Pruning.
                The subtree with the largest cost complexity that is smaller
                than ``ccp_alpha`` will be chosen. By default, no pruning is
                performed. See :ref:`minimal_cost_complexity_pruning` for details.
            groups_penalty: (float, default=0.5)
                This penalty coefficient manages the node impurity increase in
                case of a difference between treatment and control sample sizes.
            min_samples_leaf: (int or float, default=100)
                The minimum number of samples required to be at a leaf node.
                A split point at any depth will only be considered if it leaves
                at least ``min_samples_leaf`` training samples in each of the
                left and right branches. This may have the effect of smoothing
                the model, especially in regression.
                - If int, then consider `min_samples_leaf` as the minimum number.
                - If float, then `min_samples_leaf` is a fraction and
                  `ceil(min_samples_leaf * n_samples)` are the minimum
                  number of samples for each node.
            random_state: (int, RandomState instance or None, default=None)
                Used to pick randomly the `max_features` used at each split.
                See :term:`Glossary <random_state>` for details.
            groups_cnt: (bool)
                Count treatment and control groups for each node/leaf.
            groups_cnt_mode: (str, 'nodes', 'leaves')
                Mode for samples counting.
        """
        self.criterion = criterion
        self.splitter = splitter
        self.alpha = alpha
        self.control_name = control_name
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.ccp_alpha = ccp_alpha
        self.groups_penalty = groups_penalty
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.eps = 1e-5
        self._classes = {}
        self.groups_cnt = groups_cnt
        self.groups_cnt_mode = groups_cnt_mode
        self._with_outcomes = False
        self._groups_cnt = {}

        super().__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            ccp_alpha=ccp_alpha,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
        )
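    # Illustrative usage sketch (not part of the library source): constructing the
    # regressor with a few of the hyperparameters documented above. The values are
    # arbitrary examples, not recommended defaults.
    #
    #     ctree = CausalTreeRegressor(
    #         criterion="causal_mse",    # split on treatment-effect heterogeneity
    #         control_name=0,            # label of the control group in `treatment`
    #         min_samples_leaf=100,      # keep enough units per arm in each leaf
    #         alpha=0.05,                # 95% intervals for the ATE / bootstrap ITE
    #         groups_cnt=True,           # record treatment/control counts per node
    #     )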
    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        treatment: np.ndarray = None,
        sample_weight: np.ndarray = None,
        check_input=False,
    ):
        """
        Fit CausalTreeRegressor
        Args:
            X: (np.ndarray), feature matrix
            y: (np.ndarray), outcome vector
            treatment: (np.ndarray), treatment vector
            sample_weight: (np.ndarray), sample weights
            check_input: (bool)
        Returns:
            self
        """
        if self.criterion == "causal_mse" and self.min_impurity_decrease != float(
            "-inf"
        ):
            raise ValueError(
                "min_impurity_decrease must be set to -inf for causal_mse criterion"
            )

        if treatment is None and sample_weight is None:
            raise ValueError("`treatment` or `sample_weight` must be provided")

        if treatment is None:
            X, y, w = X, y, sample_weight
        else:
            X, y, w = self._prepare_data(X=X, y=y, treatment=treatment)

        self.treatment_groups = np.unique(w)

        super().fit(X=X, y=y, sample_weight=self.eps + w, check_input=check_input)

        if self.groups_cnt:
            self._groups_cnt = self._count_groups_distribution(X=X, treatment=w)
        return self
    def predict(
        self, X: np.ndarray, with_outcomes: bool = False, check_input=True
    ) -> np.ndarray:
        """Predict individual treatment effects

        Args:
            X (np.matrix): a feature matrix
            with_outcomes (bool), default=False
                include outcomes Y_hat(X|T=0), Y_hat(X|T=1) along with the
                individual treatment effect
            check_input (bool), default=True
                Allow to bypass several input checks.
        Returns:
            (np.matrix): individual treatment effect (ITE), dim=nx1,
                or ITE with outcomes [Y_hat(X|T=0), Y_hat(X|T=1), ITE], dim=nx3
        """
        if check_input:
            X = self._validate_X_predict(X, check_input)
        y_outcomes = super().predict(X)
        y_pred = y_outcomes[:, 1] - y_outcomes[:, 0]
        need_outcomes = with_outcomes or self._with_outcomes
        return (
            np.hstack([y_outcomes, y_pred.reshape(-1, 1)]) if need_outcomes else y_pred
        )
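    # Illustrative sketch (assumed variable names): `predict` returns the ITE vector
    # by default; with `with_outcomes=True` it stacks the two predicted outcomes and
    # the ITE column-wise.
    #
    #     ite = ctree.predict(X)                            # shape (n,)
    #     y0_y1_ite = ctree.predict(X, with_outcomes=True)  # shape (n, 3)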
    def fit_predict(
        self,
        X: np.ndarray,
        treatment: np.ndarray,
        y: np.ndarray,
        return_ci: bool = False,
        n_bootstraps: int = 1000,
        bootstrap_size: int = 10000,
        n_jobs: int = 1,
        verbose: bool = False,
    ) -> tuple:
        """Fit the Causal Tree model and predict treatment effects.

        Args:
            X (np.matrix): a feature matrix
            treatment (np.array): a treatment vector
            y (np.array): an outcome vector
            return_ci (bool): whether to return confidence intervals
            n_bootstraps (int): number of bootstrap iterations
            bootstrap_size (int): number of samples per bootstrap
            n_jobs (int): the number of jobs for bootstrap
            verbose (bool): whether to output progress logs
        Returns:
            (tuple):
                - te (numpy.ndarray): Predictions of treatment effects.
                - te_lower (numpy.ndarray, optional): lower bounds of treatment effects
                - te_upper (numpy.ndarray, optional): upper bounds of treatment effects
        """
        self.fit(X=X, treatment=treatment, y=y)
        te = self.predict(X=X)

        if return_ci:
            te_bootstraps = self.bootstrap_pool(
                X=X,
                treatment=treatment,
                y=y,
                n_bootstraps=n_bootstraps,
                bootstrap_size=bootstrap_size,
                n_jobs=n_jobs,
                verbose=verbose,
            )
            te_lower = np.percentile(te_bootstraps, (self.alpha / 2) * 100, axis=0)
            te_upper = np.percentile(te_bootstraps, (1 - self.alpha / 2) * 100, axis=0)
            return te, te_lower, te_upper
        else:
            return te
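    # Illustrative sketch (assumed variable names): with `return_ci=True`,
    # `fit_predict` refits the tree on bootstrap resamples in a process pool and
    # returns percentile bounds alongside the point estimates.
    #
    #     te, te_lower, te_upper = ctree.fit_predict(
    #         X, treatment, y,
    #         return_ci=True, n_bootstraps=200, bootstrap_size=5000, n_jobs=4,
    #     )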
    def estimate_ate(
        self, X: np.ndarray, treatment: np.ndarray, y: np.ndarray
    ) -> tuple:
        """Estimate the Average Treatment Effect (ATE).
        Args:
            X (np.matrix): a feature matrix
            treatment (np.array): a treatment vector
            y (np.array): an outcome vector

        Returns:
            tuple: The mean and confidence interval (LB, UB) of the ATE estimate.
        """
        dhat = self.fit_predict(X, treatment, y)

        te = dhat.mean()
        se = dhat.std() / X.shape[0]

        te_lb = te - se * norm.ppf(1 - self.alpha / 2)
        te_ub = te + se * norm.ppf(1 - self.alpha / 2)

        return te, te_lb, te_ub
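    # Illustrative sketch (assumed variable names): the ATE is the mean of the
    # fitted ITE predictions, with a normal-approximation interval at level alpha.
    #
    #     ate, ate_lb, ate_ub = ctree.estimate_ate(X, treatment, y)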
    @timeit(exclude_kwargs=("X", "treatment", "y"))
    def bootstrap_pool(
        self,
        X: np.ndarray,
        treatment: np.ndarray,
        y: np.ndarray,
        n_bootstraps: int,
        bootstrap_size: int,
        n_jobs: int,
        verbose: bool,
    ):
        """
        Run a pool of bootstraps
        Args:
            X (np.ndarray): a feature matrix
            treatment (np.ndarray): a treatment vector
            y (np.ndarray): an outcome vector
            n_bootstraps (int): number of bootstrap iterations
            bootstrap_size (int): number of samples per bootstrap
            n_jobs (int): number of processes
            verbose (bool): whether to output progress logs

        Returns:
            (np.ndarray), bootstrap estimates
        """

        def _bootstrap(i: int):
            if verbose:
                logger.info(f"Bootstrap iteration: {i}")
            return self.bootstrap(
                X=X, treatment=treatment, y=y, sample_size=bootstrap_size, seed=i
            )

        pool = PPool(nodes=n_jobs)
        pool.restart(force=True)
        bootstrap_estimates = np.array(
            list(
                tqdm.tqdm(
                    pool.imap(_bootstrap, (i for i in range(n_bootstraps))),
                    total=n_bootstraps,
                )
            )
        )
        pool.close()
        pool.join()
        return bootstrap_estimates
    def bootstrap(
        self,
        X: np.ndarray,
        treatment: np.ndarray,
        y: np.ndarray,
        sample_size: int,
        seed: int,
    ) -> np.ndarray:
        """Runs a single bootstrap.

        Fits on a bootstrapped sample, then predicts on the whole population.

        Args:
            X (np.ndarray): a feature matrix
            treatment (np.ndarray): a treatment vector
            y (np.ndarray): an outcome vector
            sample_size (int): bootstrap sample size
            seed (int): bootstrap seed

        Returns:
            (np.ndarray): bootstrap predictions
        """
        _rnd = np.random.RandomState(seed=seed)
        idxs = _rnd.choice(np.arange(0, X.shape[0]), size=sample_size)
        X_b, y_b, treatment_b = X[idxs], y[idxs], treatment[idxs]
        self.fit(X=X_b, treatment=treatment_b, y=y_b)
        te_b = self.predict(X=X)
        return te_b
    def _prepare_data(
        self, X: np.ndarray, treatment: np.ndarray, y: np.ndarray
    ) -> tuple:
        """
        Prepare input data with treatment info for DecisionTreeRegressor
        Args:
            X: (np.ndarray), feature matrix
            treatment: (np.ndarray), treatment vector
            y: (np.ndarray), outcome vector

        Returns: X, y, w
        """
        if y.shape[0] != treatment.shape[0]:
            raise ValueError(
                f"The number of `treatment` and `y` rows are not equal: {y.shape[0]} {treatment.shape[0]}"
            )

        check_treatment_vector(treatment, self.control_name)
        self.is_treatment = treatment != self.control_name
        w = self.is_treatment.astype(int)

        X = check_array(X, dtype=DTYPE, accept_sparse="csc")
        y = check_array(y, ensure_2d=False, dtype=None)

        self.n_samples, self.n_features = X.shape
        return X, y, w

    def _count_groups_distribution(self, X: np.ndarray, treatment: np.ndarray) -> dict:
        """
        Count treatment, control distribution for tree nodes/leaves
        Args:
            X: (np.ndarray), feature matrix
            treatment: (np.ndarray), treatment vector

        Returns:
            dict: treatment groups for each tree node/leaf
        """
        check_is_fitted(self)

        self.is_leaves = get_tree_leaves_mask(self)
        groups_cnt = {
            idx: {group: 0 for group in self.treatment_groups}
            for idx in np.array(range(self.tree_.node_count))
        }
        node_indicators = self.tree_.decision_path(X.astype(np.float32))

        for sample_id in range(X.shape[0]):
            nodes_path = node_indicators.indices[
                node_indicators.indptr[sample_id] : node_indicators.indptr[
                    sample_id + 1
                ]
            ]
            if self.groups_cnt_mode == "leaves":
                groups_cnt[nodes_path[-1]][treatment[sample_id]] += 1
            elif self.groups_cnt_mode == "nodes":
                for node_id in nodes_path:
                    groups_cnt[node_id][treatment[sample_id]] += 1
        return groups_cnt
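
# Illustrative end-to-end sketch (not part of the library source): fit the tree on
# synthetic data with a known constant effect and log the resulting ATE estimate.
# The data-generating process below is an assumption for demonstration only.
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    n = 5000
    X_demo = rng.normal(size=(n, 4))
    w_demo = rng.binomial(1, 0.5, size=n)  # random treatment assignment
    y_demo = X_demo[:, 0] + 0.5 * w_demo + rng.normal(scale=0.5, size=n)  # true ATE = 0.5

    demo_tree = CausalTreeRegressor(control_name=0, min_samples_leaf=200)
    demo_tree.fit(X=X_demo, treatment=w_demo, y=y_demo)
    ite_demo = demo_tree.predict(X_demo)  # per-sample treatment effects
    logger.info(f"Estimated ATE: {ite_demo.mean():.3f}")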