Source code for causalml.dataset.regression

import logging

import numpy as np
from scipy.special import expit, logit

logger = logging.getLogger("causalml")


[docs]def synthetic_data(mode=1, n=1000, p=5, sigma=1.0, adj=0.0): """ Synthetic data in Nie X. and Wager S. (2018) 'Quasi-Oracle Estimation of Heterogeneous Treatment Effects' Args: mode (int, optional): mode of the simulation: \ 1 for difficult nuisance components and an easy treatment effect. \ 2 for a randomized trial. \ 3 for an easy propensity and a difficult baseline. \ 4 for unrelated treatment and control groups. \ 5 for a hidden confounder biasing treatment. n (int, optional): number of observations p (int optional): number of covariates (>=5) sigma (float): standard deviation of the error term adj (float): adjustment term for the distribution of propensity, e. Higher values shift the distribution to 0. It does not apply to mode == 2 or 3. Returns: (tuple): Synthetically generated samples with the following outputs: - y ((n,)-array): outcome variable. - X ((n,p)-ndarray): independent variables. - w ((n,)-array): treatment flag with value 0 or 1. - tau ((n,)-array): individual treatment effect. - b ((n,)-array): expected outcome. - e ((n,)-array): propensity of receiving treatment. """ catalog = { 1: simulate_nuisance_and_easy_treatment, 2: simulate_randomized_trial, 3: simulate_easy_propensity_difficult_baseline, 4: simulate_unrelated_treatment_control, 5: simulate_hidden_confounder, } assert mode in catalog, "Invalid mode {}. Should be one of {}".format( mode, set(catalog) ) return catalog[mode](n, p, sigma, adj)
[docs]def simulate_nuisance_and_easy_treatment(n=1000, p=5, sigma=1.0, adj=0.0): """Synthetic data with a difficult nuisance components and an easy treatment effect From Setup A in Nie X. and Wager S. (2018) 'Quasi-Oracle Estimation of Heterogeneous Treatment Effects' Args: n (int, optional): number of observations p (int optional): number of covariates (>=5) sigma (float): standard deviation of the error term adj (float): adjustment term for the distribution of propensity, e. Higher values shift the distribution to 0. Returns: (tuple): Synthetically generated samples with the following outputs: - y ((n,)-array): outcome variable. - X ((n,p)-ndarray): independent variables. - w ((n,)-array): treatment flag with value 0 or 1. - tau ((n,)-array): individual treatment effect. - b ((n,)-array): expected outcome. - e ((n,)-array): propensity of receiving treatment. """ X = np.random.uniform(size=n * p).reshape((n, -1)) b = ( np.sin(np.pi * X[:, 0] * X[:, 1]) + 2 * (X[:, 2] - 0.5) ** 2 + X[:, 3] + 0.5 * X[:, 4] ) eta = 0.1 e = np.maximum( np.repeat(eta, n), np.minimum(np.sin(np.pi * X[:, 0] * X[:, 1]), np.repeat(1 - eta, n)), ) e = expit(logit(e) - adj) tau = (X[:, 0] + X[:, 1]) / 2 w = np.random.binomial(1, e, size=n) y = b + (w - 0.5) * tau + sigma * np.random.normal(size=n) return y, X, w, tau, b, e
[docs]def simulate_randomized_trial(n=1000, p=5, sigma=1.0, adj=0.0): """Synthetic data of a randomized trial From Setup B in Nie X. and Wager S. (2018) 'Quasi-Oracle Estimation of Heterogeneous Treatment Effects' Args: n (int, optional): number of observations p (int optional): number of covariates (>=5) sigma (float): standard deviation of the error term adj (float): no effect. added for consistency Returns: (tuple): Synthetically generated samples with the following outputs: - y ((n,)-array): outcome variable. - X ((n,p)-ndarray): independent variables. - w ((n,)-array): treatment flag with value 0 or 1. - tau ((n,)-array): individual treatment effect. - b ((n,)-array): expected outcome. - e ((n,)-array): propensity of receiving treatment. """ X = np.random.normal(size=n * p).reshape((n, -1)) b = np.maximum.reduce([np.repeat(0.0, n), X[:, 0] + X[:, 1], X[:, 2]]) + np.maximum( np.repeat(0.0, n), X[:, 3] + X[:, 4] ) e = np.repeat(0.5, n) tau = X[:, 0] + np.log1p(np.exp(X[:, 1])) w = np.random.binomial(1, e, size=n) y = b + (w - 0.5) * tau + sigma * np.random.normal(size=n) return y, X, w, tau, b, e
[docs]def simulate_easy_propensity_difficult_baseline(n=1000, p=5, sigma=1.0, adj=0.0): """Synthetic data with easy propensity and a difficult baseline From Setup C in Nie X. and Wager S. (2018) 'Quasi-Oracle Estimation of Heterogeneous Treatment Effects' Args: n (int, optional): number of observations p (int optional): number of covariates (>=3) sigma (float): standard deviation of the error term adj (float): no effect. added for consistency Returns: (tuple): Synthetically generated samples with the following outputs: - y ((n,)-array): outcome variable. - X ((n,p)-ndarray): independent variables. - w ((n,)-array): treatment flag with value 0 or 1. - tau ((n,)-array): individual treatment effect. - b ((n,)-array): expected outcome. - e ((n,)-array): propensity of receiving treatment. """ X = np.random.normal(size=n * p).reshape((n, -1)) b = 2 * np.log1p(np.exp(X[:, 0] + X[:, 1] + X[:, 2])) e = 1 / (1 + np.exp(X[:, 1] + X[:, 2])) tau = np.repeat(1.0, n) w = np.random.binomial(1, e, size=n) y = b + (w - 0.5) * tau + sigma * np.random.normal(size=n) return y, X, w, tau, b, e
[docs]def simulate_unrelated_treatment_control(n=1000, p=5, sigma=1.0, adj=0.0): """Synthetic data with unrelated treatment and control groups. From Setup D in Nie X. and Wager S. (2018) 'Quasi-Oracle Estimation of Heterogeneous Treatment Effects' Args: n (int, optional): number of observations p (int optional): number of covariates (>=3) sigma (float): standard deviation of the error term adj (float): adjustment term for the distribution of propensity, e. Higher values shift the distribution to 0. Returns: (tuple): Synthetically generated samples with the following outputs: - y ((n,)-array): outcome variable. - X ((n,p)-ndarray): independent variables. - w ((n,)-array): treatment flag with value 0 or 1. - tau ((n,)-array): individual treatment effect. - b ((n,)-array): expected outcome. - e ((n,)-array): propensity of receiving treatment. """ X = np.random.normal(size=n * p).reshape((n, -1)) b = ( np.maximum(np.repeat(0.0, n), X[:, 0] + X[:, 1] + X[:, 2]) + np.maximum(np.repeat(0.0, n), X[:, 3] + X[:, 4]) ) / 2 e = 1 / (1 + np.exp(-X[:, 0]) + np.exp(-X[:, 1])) e = expit(logit(e) - adj) tau = np.maximum(np.repeat(0.0, n), X[:, 0] + X[:, 1] + X[:, 2]) - np.maximum( np.repeat(0.0, n), X[:, 3] + X[:, 4] ) w = np.random.binomial(1, e, size=n) y = b + (w - 0.5) * tau + sigma * np.random.normal(size=n) return y, X, w, tau, b, e
[docs]def simulate_hidden_confounder(n=10000, p=5, sigma=1.0, adj=0.0): """Synthetic dataset with a hidden confounder biasing treatment. From Louizos et al. (2018) "Causal Effect Inference with Deep Latent-Variable Models" Args: n (int, optional): number of observations p (int optional): number of covariates (>=3) sigma (float): standard deviation of the error term adj (float): no effect. added for consistency Returns: (tuple): Synthetically generated samples with the following outputs: - y ((n,)-array): outcome variable. - X ((n,p)-ndarray): independent variables. - w ((n,)-array): treatment flag with value 0 or 1. - tau ((n,)-array): individual treatment effect. - b ((n,)-array): expected outcome. - e ((n,)-array): propensity of receiving treatment. """ z = np.random.binomial(1, 0.5, size=n).astype(np.double) X = np.random.normal(z, 5 * z + 3 * (1 - z), size=(p, n)).T e = 0.75 * z + 0.25 * (1 - z) w = np.random.binomial(1, e) b = expit(3 * (z + 2 * (2 * w - 2))) y = np.random.binomial(1, b) # Compute true ite tau for evaluation (via Monte Carlo approximation). t0_t1 = np.array([[0.0], [1.0]]) y_t0, y_t1 = expit(3 * (z + 2 * (2 * t0_t1 - 2))) tau = y_t1 - y_t0 return y, X, w, tau, b, e