2SLS Benchmarks with NLSYM + Synthetic Datasets

We demonstrate how to use the two-stage least squares (2SLS) estimator from the package to estimate the average treatment effect, on both semi-synthetic data based on NLSYM and fully synthetic data.
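As a reminder of what the estimator does, 2SLS with a single endogenous treatment can be written as two OLS stages: first regress the treatment on the instrument and the covariates, then regress the outcome on the fitted treatment and the covariates. The sketch below is illustrative only — the helper name two_stage_ls is ours, and the naive second-stage standard errors are not valid 2SLS standard errors — so the package's IVRegressor is what we actually use in this notebook.

[ ]:
import numpy as np
import statsmodels.api as sm

def two_stage_ls(y, treatment, instrument, X):
    """Illustrative 2SLS point estimate; not the package implementation."""
    # Stage 1: project the treatment onto the instrument and the covariates
    Z = sm.add_constant(np.c_[instrument, X], prepend=False)
    t_hat = sm.OLS(treatment, Z).fit().predict(Z)
    # Stage 2: regress the outcome on the fitted treatment and the covariates
    W = sm.add_constant(np.c_[t_hat, X], prepend=False)
    second_stage = sm.OLS(y, W).fit()
    # params[0] is the coefficient on the fitted treatment, i.e. the 2SLS estimate
    return second_stage.params[0]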

[1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
[2]:
import os
base_path = os.path.abspath("../")
os.chdir(base_path)
[52]:
import logging
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import sys
from scipy import stats
[39]:
import causalml
from causalml.inference.iv import IVRegressor
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

Semi-Synthetic Data from NLSYM

The data generation mechanism is described in Syrgkanis et al., “Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments” (2019).

Data Loading

[5]:
df = pd.read_csv("docs/examples/data/card.csv")
[6]:
df.head()
[6]:
id nearc2 nearc4 educ age fatheduc motheduc weight momdad14 sinmom14 ... smsa66 wage enroll kww iq married libcrd14 exper lwage expersq
0 2 0 0 7 29 NaN NaN 158413 1 0 ... 1 548 0 15.0 NaN 1.0 0.0 16 6.306275 256
1 3 0 0 12 27 8.0 8.0 380166 1 0 ... 1 481 0 35.0 93.0 1.0 1.0 9 6.175867 81
2 4 0 0 12 34 14.0 12.0 367470 1 0 ... 1 721 0 42.0 103.0 1.0 1.0 16 6.580639 256
3 5 1 1 11 27 11.0 12.0 380166 1 0 ... 1 250 0 25.0 88.0 1.0 1.0 10 5.521461 100
4 6 1 1 12 34 8.0 7.0 367470 1 0 ... 1 729 0 34.0 108.0 1.0 0.0 16 6.591674 256

5 rows × 34 columns

[7]:
df.columns.values

[7]:
array(['id', 'nearc2', 'nearc4', 'educ', 'age', 'fatheduc', 'motheduc',
       'weight', 'momdad14', 'sinmom14', 'step14', 'reg661', 'reg662',
       'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668',
       'reg669', 'south66', 'black', 'smsa', 'south', 'smsa66', 'wage',
       'enroll', 'kww', 'iq', 'married', 'libcrd14', 'exper', 'lwage',
       'expersq'], dtype=object)
[30]:
data_filter = df['educ'] >= 6
# outcome variable
y = df[data_filter]['lwage'].values
# treatment variable
treatment = df[data_filter]['educ'].values
# instrumental variable
w = df[data_filter]['nearc4'].values

# covariates; take a copy so the fillna assignments below do not
# trigger pandas' chained-assignment warning
Xdf = df[data_filter][['fatheduc', 'motheduc', 'momdad14', 'sinmom14', 'reg661', 'reg662',
                       'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668',
                       'reg669', 'south66', 'black', 'smsa', 'south', 'smsa66',
                       'exper', 'expersq']].copy()
# impute missing parental education with the column means
Xdf['fatheduc'] = Xdf['fatheduc'].fillna(Xdf['fatheduc'].mean())
Xdf['motheduc'] = Xdf['motheduc'].fillna(Xdf['motheduc'].mean())
# standardize the continuous covariates
Xscale = Xdf.copy()
Xscale[['fatheduc', 'motheduc', 'exper', 'expersq']] = StandardScaler().fit_transform(
    Xscale[['fatheduc', 'motheduc', 'exper', 'expersq']])
X = Xscale.values
[32]:
Xscale.describe()

[32]:
fatheduc motheduc momdad14 sinmom14 reg661 reg662 reg663 reg664 reg665 reg666 reg667 reg668 reg669 south66 black smsa south smsa66 exper expersq
count 2.991000e+03 2.991000e+03 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2991.000000 2.991000e+03 2.991000e+03
mean -3.529069e-16 -1.704346e-15 0.790371 0.100301 0.046807 0.161484 0.196924 0.064527 0.205951 0.094952 0.109997 0.028419 0.090939 0.410899 0.231361 0.715145 0.400201 0.651622 4.285921e-16 3.040029e-17
std 1.000167e+00 1.000167e+00 0.407112 0.300451 0.211261 0.368039 0.397741 0.245730 0.404463 0.293197 0.312938 0.166193 0.287571 0.492079 0.421773 0.451421 0.490021 0.476536 1.000167e+00 1.000167e+00
min -3.101056e+00 -3.502453e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -2.159127e+00 -1.147691e+00
25% -6.303764e-01 -4.656485e-01 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -6.858865e-01 -7.077287e-01
50% 0.000000e+00 2.091970e-01 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 -1.948066e-01 -3.655360e-01
75% 6.049634e-01 5.466197e-01 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 1.000000 1.000000 5.418134e-01 3.310707e-01
max 2.457973e+00 2.571156e+00 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 3.242753e+00 4.767355e+00

Semi-Synthetic Data Generation

[29]:
def semi_synth_nlsym(X, w, random_seed=None):
    np.random.seed(random_seed)
    nobs = X.shape[0]
    nv = np.random.uniform(0, 1, size=nobs)
    c0 = np.random.uniform(0.2, 0.3)
    C = c0 * X[:, 1]
    # Treatment compliance depends on mother's education (X[:, 1] = motheduc)
    treatment = C * w + X[:, 1] + nv
    # Treatment effect depends on mother's education and on growing up
    # in a single-mom family at age 14 (X[:, 3] = sinmom14)
    theta = 0.1 + 0.05 * X[:, 1] - 0.1 * X[:, 3]
    # Additional direct effect of mother's education on the outcome
    f = 0.05 * X[:, 1]
    y = theta * (treatment + nv) + f + np.random.normal(0, 0.1, size=nobs)

    return y, treatment, theta
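
Written out, for each unit $i$ the generator draws

$$T_i = c_0\,\mathrm{motheduc}_i\,Z_i + \mathrm{motheduc}_i + \nu_i, \qquad \theta_i = 0.1 + 0.05\,\mathrm{motheduc}_i - 0.1\,\mathrm{sinmom14}_i,$$

$$Y_i = \theta_i\,(T_i + \nu_i) + 0.05\,\mathrm{motheduc}_i + \varepsilon_i,$$

with $\nu_i \sim U(0,1)$, $c_0 \sim U(0.2, 0.3)$ and $\varepsilon_i \sim N(0, 0.1^2)$. Because $\nu_i$ enters both the treatment and the outcome, the treatment is endogenous, and the instrument $Z_i$ (nearc4) provides identification.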
[33]:
y_sim, treatment_sim, theta = semi_synth_nlsym(Xdf.values, w)

Estimation

[36]:
# True value
theta.mean()
[36]:
0.6089706667314586
[38]:
# 2SLS estimate
iv_fit = IVRegressor()
iv_fit.fit(X, treatment_sim, y_sim, w)
ate, ate_sd = iv_fit.predict()
(ate, ate_sd)
[38]:
(0.6611532131769402, 0.013922622951893662)
[51]:
# OLS estimate; with prepend=False the constant is appended last,
# so params[0] below is the coefficient on the treatment
ols_fit = sm.OLS(y_sim, sm.add_constant(np.c_[treatment_sim, X], prepend=False)).fit()
(ols_fit.params[0], ols_fit.bse[0])
[51]:
(0.7501211540497275, 0.012800163754977008)
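
The 2SLS estimate is close to the true average effect, while OLS is biased upward, as expected given the endogeneity built into the data generating process (the noise nv enters both the treatment and the outcome).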

Pure Synthetic Data

The data generation mechanism is described in Hong et al., “Semiparametric Efficiency in Nonlinear LATE Models” (2010).

Data Generation

[54]:
def synthetic_data(n=10000, random_seed=None):
    np.random.seed(random_seed)
    # First stage: latent-index treatment model, shifted by delta when z=1
    gamma0 = -0.5
    gamma1 = 1.0
    delta = 1.0
    x = np.random.uniform(size=n)
    v = np.random.normal(size=n)
    # potential treatment status under z=1 and z=0
    d1 = (gamma0 + x * gamma1 + delta + v >= 0).astype(float)
    d0 = (gamma0 + x * gamma1 + v >= 0).astype(float)

    # Potential outcomes built from Poisson components
    alpha = 1.0
    beta = 0.5
    lambda11 = 2.0
    lambda00 = 1.0
    xi1 = np.random.poisson(np.exp(alpha + x * beta))
    xi2 = np.random.poisson(np.exp(x * beta))
    xi3 = np.random.poisson(np.exp(lambda11), size=n)  # shared component for always-takers
    xi4 = np.random.poisson(np.exp(lambda00), size=n)  # shared component for never-takers

    y1 = xi1 + xi3 * ((d1 == 1) & (d0 == 1)) + xi4 * ((d1 == 0) & (d0 == 0))
    y0 = xi2 + xi3 * ((d1 == 1) & (d0 == 1)) + xi4 * ((d1 == 0) & (d0 == 0))

    # Randomized instrument, observed treatment, and observed outcome
    z = np.random.binomial(1, stats.norm.cdf(x))
    d = d1 * z + d0 * (1 - z)
    y = y1 * d + y0 * (1 - d)

    # return the true LATE among compliers (d1 > d0) as the last element
    return y, x, d, z, y1[d1 > d0].mean() - y0[d1 > d0].mean()
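
The last return value is the true local average treatment effect (LATE), i.e. the average effect among compliers, the units whose treatment status is switched by the instrument:

$$\mathrm{LATE} = E\big[Y_i(1) - Y_i(0) \mid D_i(1) > D_i(0)\big].$$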
[55]:
y, x, d, z, late = synthetic_data()

Estimation

[56]:
# True value
late
[56]:
2.1789099526066353
[57]:
# 2SLS estimate
iv_fit = IVRegressor()
iv_fit.fit(x, d, y, z)
ate, ate_sd = iv_fit.predict()
(ate, ate_sd)
[57]:
(2.1900472390231775, 0.2623695460540134)
[59]:
# OLS estimate; params[0] is again the coefficient on the treatment d
ols_fit = sm.OLS(y, sm.add_constant(np.c_[d, x], prepend=False)).fit()
(ols_fit.params[0], ols_fit.bse[0])
[59]:
(5.3482879532439975, 0.09201397327077365)
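
As in the semi-synthetic benchmark, 2SLS recovers the true LATE well, while OLS is severely biased: treatment take-up depends on the latent index v, and always-takers and never-takers receive different Poisson components in the outcome.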