2SLS Benchmarks with NLSYM + Synthetic Datasets
We demonstrate how to use the package's 2SLS estimator (`IVRegressor`) to estimate the average treatment effect, first on semi-synthetic data and then on fully synthetic data.
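As a brief refresher, 2SLS estimates the effect of a treatment $T$ on an outcome $Y$ using an instrument $Z$ and covariates $X$ in two stages: the first stage regresses the treatment on the instrument and covariates, and the second stage regresses the outcome on the first-stage fitted values $\hat{T}$ and the covariates,

$$T = Z\pi + X\beta + \nu, \qquad Y = \hat{T}\theta + X\gamma + \epsilon,$$

where the second-stage coefficient $\theta$ is the treatment effect reported below.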
[1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
[2]:
import os
base_path = os.path.abspath("../")
os.chdir(base_path)
[52]:
import logging
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import sys
from scipy import stats
[39]:
import causalml
from causalml.inference.iv import IVRegressor
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
Semi-Synthetic Data from NLSYM
The data generation mechanism is described in Syrgkanis et al. (2019), “Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments”.
Data Loading
[5]:
df = pd.read_csv("docs/examples/data/card.csv")
[6]:
df.head()
[6]:
 | id | nearc2 | nearc4 | educ | age | fatheduc | motheduc | weight | momdad14 | sinmom14 | ... | smsa66 | wage | enroll | kww | iq | married | libcrd14 | exper | lwage | expersq
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2 | 0 | 0 | 7 | 29 | NaN | NaN | 158413 | 1 | 0 | ... | 1 | 548 | 0 | 15.0 | NaN | 1.0 | 0.0 | 16 | 6.306275 | 256 |
1 | 3 | 0 | 0 | 12 | 27 | 8.0 | 8.0 | 380166 | 1 | 0 | ... | 1 | 481 | 0 | 35.0 | 93.0 | 1.0 | 1.0 | 9 | 6.175867 | 81 |
2 | 4 | 0 | 0 | 12 | 34 | 14.0 | 12.0 | 367470 | 1 | 0 | ... | 1 | 721 | 0 | 42.0 | 103.0 | 1.0 | 1.0 | 16 | 6.580639 | 256 |
3 | 5 | 1 | 1 | 11 | 27 | 11.0 | 12.0 | 380166 | 1 | 0 | ... | 1 | 250 | 0 | 25.0 | 88.0 | 1.0 | 1.0 | 10 | 5.521461 | 100 |
4 | 6 | 1 | 1 | 12 | 34 | 8.0 | 7.0 | 367470 | 1 | 0 | ... | 1 | 729 | 0 | 34.0 | 108.0 | 1.0 | 0.0 | 16 | 6.591674 | 256 |
5 rows × 34 columns
[7]:
df.columns.values
[7]:
array(['id', 'nearc2', 'nearc4', 'educ', 'age', 'fatheduc', 'motheduc',
'weight', 'momdad14', 'sinmom14', 'step14', 'reg661', 'reg662',
'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668',
'reg669', 'south66', 'black', 'smsa', 'south', 'smsa66', 'wage',
'enroll', 'kww', 'iq', 'married', 'libcrd14', 'exper', 'lwage',
'expersq'], dtype=object)
[30]:
data_filter = df['educ'] >= 6
# outcome variable
y = df[data_filter]['lwage'].values
# treatment variable
treatment = df[data_filter]['educ'].values
# instrumental variable
w = df[data_filter]['nearc4'].values
Xdf = df[data_filter][['fatheduc', 'motheduc', 'momdad14', 'sinmom14', 'reg661', 'reg662',
                       'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668',
                       'reg669', 'south66', 'black', 'smsa', 'south', 'smsa66',
                       'exper', 'expersq']].copy()
# impute missing parental education with the column means
Xdf['fatheduc'] = Xdf['fatheduc'].fillna(value=Xdf['fatheduc'].mean())
Xdf['motheduc'] = Xdf['motheduc'].fillna(value=Xdf['motheduc'].mean())
# standardize the continuous covariates
Xscale = Xdf.copy()
Xscale[['fatheduc', 'motheduc', 'exper', 'expersq']] = StandardScaler().fit_transform(
    Xscale[['fatheduc', 'motheduc', 'exper', 'expersq']])
X = Xscale.values
[32]:
Xscale.describe()
[32]:
 | fatheduc | motheduc | momdad14 | sinmom14 | reg661 | reg662 | reg663 | reg664 | reg665 | reg666 | reg667 | reg668 | reg669 | south66 | black | smsa | south | smsa66 | exper | expersq
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 2.991000e+03 | 2.991000e+03 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2.991000e+03 | 2.991000e+03 |
mean | -3.529069e-16 | -1.704346e-15 | 0.790371 | 0.100301 | 0.046807 | 0.161484 | 0.196924 | 0.064527 | 0.205951 | 0.094952 | 0.109997 | 0.028419 | 0.090939 | 0.410899 | 0.231361 | 0.715145 | 0.400201 | 0.651622 | 4.285921e-16 | 3.040029e-17 |
std | 1.000167e+00 | 1.000167e+00 | 0.407112 | 0.300451 | 0.211261 | 0.368039 | 0.397741 | 0.245730 | 0.404463 | 0.293197 | 0.312938 | 0.166193 | 0.287571 | 0.492079 | 0.421773 | 0.451421 | 0.490021 | 0.476536 | 1.000167e+00 | 1.000167e+00 |
min | -3.101056e+00 | -3.502453e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -2.159127e+00 | -1.147691e+00 |
25% | -6.303764e-01 | -4.656485e-01 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -6.858865e-01 | -7.077287e-01 |
50% | 0.000000e+00 | 2.091970e-01 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | -1.948066e-01 | -3.655360e-01 |
75% | 6.049634e-01 | 5.466197e-01 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 5.418134e-01 | 3.310707e-01 |
max | 2.457973e+00 | 2.571156e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.242753e+00 | 4.767355e+00 |
Semi-Synthetic Data Generation
[29]:
def semi_synth_nlsym(X, w, random_seed=None):
    np.random.seed(random_seed)
    nobs = X.shape[0]
    # unobserved confounder that enters both the treatment and the outcome
    nv = np.random.uniform(0, 1, size=nobs)
    c0 = np.random.uniform(0.2, 0.3)
    C = c0 * X[:, 1]
    # Treatment compliance depends on mother's education (X[:, 1] = motheduc)
    treatment = C * w + X[:, 1] + nv
    # Treatment effect depends on mother's education and on single-mom family
    # at age 14 (X[:, 3] = sinmom14)
    theta = 0.1 + 0.05 * X[:, 1] - 0.1 * X[:, 3]
    # Additional direct effect of mother's education on the outcome
    f = 0.05 * X[:, 1]
    y = theta * (treatment + nv) + f + np.random.normal(0, 0.1, size=nobs)
    return y, treatment, theta
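Written out, the generating process above is (with $m$ = motheduc, $s$ = sinmom14, $c_0 \sim U(0.2, 0.3)$, and $\nu \sim U(0, 1)$ an unobserved confounder entering both equations, which makes the treatment endogenous):

$$T = c_0\, m\, w + m + \nu, \qquad \theta(X) = 0.1 + 0.05\, m - 0.1\, s,$$

$$Y = \theta(X)\,(T + \nu) + 0.05\, m + \varepsilon, \qquad \varepsilon \sim \mathcal{N}(0, 0.1^2).$$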
[33]:
y_sim, treatment_sim, theta = semi_synth_nlsym(Xdf.values, w)
Estimation
[36]:
# True value: unweighted mean of the heterogeneous effect theta
theta.mean()
[36]:
0.6089706667314586
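Since the simulated effect $\theta$ varies with mother's education and family structure, a quick histogram (an illustrative extra cell) shows the spread of the true effect around this mean:
[ ]:
# distribution of the true heterogeneous effect theta
plt.hist(theta, bins=50)
plt.xlabel('theta')
plt.ylabel('count')
plt.show()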
[38]:
# 2SLS estimate; with heterogeneous effects, 2SLS recovers a
# compliance-weighted average of theta, so it can differ slightly
# from the unweighted mean above
iv_fit = IVRegressor()
iv_fit.fit(X, treatment_sim, y_sim, w)
ate, ate_sd = iv_fit.predict()
(ate, ate_sd)
[38]:
(0.6611532131769402, 0.013922622951893662)
[51]:
# OLS estimate; biased upward because the unobserved nv drives both
# the treatment and the outcome
ols_fit = sm.OLS(y_sim, sm.add_constant(np.c_[treatment_sim, X], prepend=False)).fit()
(ols_fit.params[0], ols_fit.bse[0])
[51]:
(0.7501211540497275, 0.012800163754977008)
Pure Synthetic Data
The data generation mechanism is described in Hong et al. (2010), “Semiparametric Efficiency in Nonlinear LATE Models”.
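With a binary instrument $Z$ and a binary treatment $D$, the target estimand is the local average treatment effect (LATE) on compliers, i.e. the units whose treatment status responds to the instrument ($D(1) > D(0)$):

$$\text{LATE} = E\left[Y(1) - Y(0) \mid D(1) > D(0)\right],$$

which is exactly what the generator below returns as its last element.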
Data Generation
[54]:
def synthetic_data(n=10000, random_seed=None):
    np.random.seed(random_seed)
    gamma0 = -0.5
    gamma1 = 1.0
    delta = 1.0
    x = np.random.uniform(size=n)
    v = np.random.normal(size=n)
    # potential treatment status with (d1) and without (d0) the instrument
    d1 = (gamma0 + x * gamma1 + delta + v >= 0).astype(float)
    d0 = (gamma0 + x * gamma1 + v >= 0).astype(float)
    alpha = 1.0
    beta = 0.5
    lambda11 = 2.0
    lambda00 = 1.0
    xi1 = np.random.poisson(np.exp(alpha + x * beta))
    xi2 = np.random.poisson(np.exp(x * beta))
    xi3 = np.random.poisson(np.exp(lambda11), size=n)
    xi4 = np.random.poisson(np.exp(lambda00), size=n)
    # potential outcomes; always-takers (d1 == d0 == 1) get the extra xi3
    # component and never-takers (d1 == d0 == 0) the extra xi4 component
    y1 = xi1 + xi3 * ((d1 == 1) & (d0 == 1)) + xi4 * ((d1 == 0) & (d0 == 0))
    y0 = xi2 + xi3 * ((d1 == 1) & (d0 == 1)) + xi4 * ((d1 == 0) & (d0 == 0))
    # instrument assignment, realized treatment, and observed outcome
    z = np.random.binomial(1, stats.norm.cdf(x))
    d = d1 * z + d0 * (1 - z)
    y = y1 * d + y0 * (1 - d)
    # the last return value is the true LATE: the mean effect among compliers
    return y, x, d, z, y1[d1 > d0].mean() - y0[d1 > d0].mean()
[55]:
y, x, d, z, late = synthetic_data()
Estimation
[56]:
# True value
late
[56]:
2.1789099526066353
[57]:
# 2SLS estimate
iv_fit = IVRegressor()
iv_fit.fit(x, d, y, z)
ate, ate_sd = iv_fit.predict()
(ate, ate_sd)
[57]:
(2.1900472390231775, 0.2623695460540134)
[59]:
# OLS estimate; biased because always-takers, who are treated regardless
# of the instrument, also have systematically higher outcome levels (xi3)
ols_fit = sm.OLS(y, sm.add_constant(np.c_[d, x], prepend=False)).fit()
(ols_fit.params[0], ols_fit.bse[0])
[59]:
(5.3482879532439975, 0.09201397327077365)
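As a quick cross-check, the simple Wald ratio for a binary instrument should land in the same neighborhood as the 2SLS estimate, although it ignores the covariate x and so retains some bias (an illustrative snippet, not part of the IVRegressor API):
[ ]:
# Wald ratio: difference in mean outcomes divided by the difference in mean
# treatment take-up between the two instrument arms (covariate x is ignored)
wald = (y[z == 1].mean() - y[z == 0].mean()) / (d[z == 1].mean() - d[z == 0].mean())
wald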