2SLS Benchmarks with NLSYM + Synthetic Datasets
We demonstrate how to use the package's 2SLS estimator (`IVRegressor`) to estimate the average treatment effect, first on semi-synthetic data and then on fully synthetic data.
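As a brief refresher, 2SLS estimates the effect of a treatment $T$ on an outcome $Y$ using an instrument $Z$ and covariates $X$ in two stages: the first stage regresses the treatment on the instrument and covariates, and the second stage regresses the outcome on the first-stage fitted values $\hat{T}$ and the covariates,

$$T = Z\pi + X\beta + \nu, \qquad Y = \hat{T}\theta + X\gamma + \epsilon,$$

where the second-stage coefficient $\theta$ is the treatment effect reported below.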
[1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
[2]:
import os
base_path = os.path.abspath("../")
os.chdir(base_path)
[52]:
import logging
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import sys
from scipy import stats
[39]:
import causalml
from causalml.inference.iv import IVRegressor
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
Semi-Synthetic Data from NLSYM
The data generation mechanism is described in Syrgkanis et al. (2019), “Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments”.
Data Loading
[5]:
df = pd.read_csv("docs/examples/data/card.csv")
[6]:
df.head()
[6]:
 | id | nearc2 | nearc4 | educ | age | fatheduc | motheduc | weight | momdad14 | sinmom14 | ... | smsa66 | wage | enroll | kww | iq | married | libcrd14 | exper | lwage | expersq
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2 | 0 | 0 | 7 | 29 | NaN | NaN | 158413 | 1 | 0 | ... | 1 | 548 | 0 | 15.0 | NaN | 1.0 | 0.0 | 16 | 6.306275 | 256 |
1 | 3 | 0 | 0 | 12 | 27 | 8.0 | 8.0 | 380166 | 1 | 0 | ... | 1 | 481 | 0 | 35.0 | 93.0 | 1.0 | 1.0 | 9 | 6.175867 | 81 |
2 | 4 | 0 | 0 | 12 | 34 | 14.0 | 12.0 | 367470 | 1 | 0 | ... | 1 | 721 | 0 | 42.0 | 103.0 | 1.0 | 1.0 | 16 | 6.580639 | 256 |
3 | 5 | 1 | 1 | 11 | 27 | 11.0 | 12.0 | 380166 | 1 | 0 | ... | 1 | 250 | 0 | 25.0 | 88.0 | 1.0 | 1.0 | 10 | 5.521461 | 100 |
4 | 6 | 1 | 1 | 12 | 34 | 8.0 | 7.0 | 367470 | 1 | 0 | ... | 1 | 729 | 0 | 34.0 | 108.0 | 1.0 | 0.0 | 16 | 6.591674 | 256 |
5 rows × 34 columns
[7]:
df.columns.values
[7]:
array(['id', 'nearc2', 'nearc4', 'educ', 'age', 'fatheduc', 'motheduc',
'weight', 'momdad14', 'sinmom14', 'step14', 'reg661', 'reg662',
'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668',
'reg669', 'south66', 'black', 'smsa', 'south', 'smsa66', 'wage',
'enroll', 'kww', 'iq', 'married', 'libcrd14', 'exper', 'lwage',
'expersq'], dtype=object)
[30]:
data_filter = df['educ'] >= 6
# outcome variable
y = df[data_filter]['lwage'].values
# treatment variable
treatment = df[data_filter]['educ'].values
# instrumental variable
w = df[data_filter]['nearc4'].values
Xdf = df[data_filter][['fatheduc', 'motheduc', 'momdad14', 'sinmom14', 'reg661', 'reg662',
                       'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668',
                       'reg669', 'south66', 'black', 'smsa', 'south', 'smsa66',
                       'exper', 'expersq']].copy()
# impute missing parental education with the column means
Xdf['fatheduc'] = Xdf['fatheduc'].fillna(value=Xdf['fatheduc'].mean())
Xdf['motheduc'] = Xdf['motheduc'].fillna(value=Xdf['motheduc'].mean())
# standardize the continuous covariates
Xscale = Xdf.copy()
Xscale[['fatheduc', 'motheduc', 'exper', 'expersq']] = StandardScaler().fit_transform(
    Xscale[['fatheduc', 'motheduc', 'exper', 'expersq']])
X = Xscale.values
[32]:
Xscale.describe()
[32]:
 | fatheduc | motheduc | momdad14 | sinmom14 | reg661 | reg662 | reg663 | reg664 | reg665 | reg666 | reg667 | reg668 | reg669 | south66 | black | smsa | south | smsa66 | exper | expersq
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 2.991000e+03 | 2.991000e+03 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2991.000000 | 2.991000e+03 | 2.991000e+03 |
mean | -3.529069e-16 | -1.704346e-15 | 0.790371 | 0.100301 | 0.046807 | 0.161484 | 0.196924 | 0.064527 | 0.205951 | 0.094952 | 0.109997 | 0.028419 | 0.090939 | 0.410899 | 0.231361 | 0.715145 | 0.400201 | 0.651622 | 4.285921e-16 | 3.040029e-17 |
std | 1.000167e+00 | 1.000167e+00 | 0.407112 | 0.300451 | 0.211261 | 0.368039 | 0.397741 | 0.245730 | 0.404463 | 0.293197 | 0.312938 | 0.166193 | 0.287571 | 0.492079 | 0.421773 | 0.451421 | 0.490021 | 0.476536 | 1.000167e+00 | 1.000167e+00 |
min | -3.101056e+00 | -3.502453e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -2.159127e+00 | -1.147691e+00 |
25% | -6.303764e-01 | -4.656485e-01 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -6.858865e-01 | -7.077287e-01 |
50% | 0.000000e+00 | 2.091970e-01 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | -1.948066e-01 | -3.655360e-01 |
75% | 6.049634e-01 | 5.466197e-01 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 5.418134e-01 | 3.310707e-01 |
max | 2.457973e+00 | 2.571156e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.242753e+00 | 4.767355e+00 |
Semi-Synthetic Data Generation
[29]:
def semi_synth_nlsym(X, w, random_seed=None):
    np.random.seed(random_seed)
    nobs = X.shape[0]
    # unobserved confounder that enters both the treatment and the outcome
    nv = np.random.uniform(0, 1, size=nobs)
    c0 = np.random.uniform(0.2, 0.3)
    C = c0 * X[:, 1]
    # Treatment compliance depends on mother's education (X[:, 1] = motheduc)
    treatment = C * w + X[:, 1] + nv
    # Treatment effect depends on mother's education and on single-mom family
    # at age 14 (X[:, 3] = sinmom14)
    theta = 0.1 + 0.05 * X[:, 1] - 0.1 * X[:, 3]
    # Additional direct effect of mother's education on the outcome
    f = 0.05 * X[:, 1]
    y = theta * (treatment + nv) + f + np.random.normal(0, 0.1, size=nobs)
    return y, treatment, theta
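Written out, the generating process above is (with $m$ = motheduc, $s$ = sinmom14, $c_0 \sim U(0.2, 0.3)$, and $\nu \sim U(0, 1)$ an unobserved confounder entering both equations, which makes the treatment endogenous):

$$T = c_0\, m\, w + m + \nu, \qquad \theta(X) = 0.1 + 0.05\, m - 0.1\, s,$$

$$Y = \theta(X)\,(T + \nu) + 0.05\, m + \varepsilon, \qquad \varepsilon \sim \mathcal{N}(0, 0.1^2).$$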
[33]:
y_sim, treatment_sim, theta = semi_synth_nlsym(Xdf.values, w)
Estimation
[36]:
# True value: unweighted mean of the heterogeneous effect theta
theta.mean()
[36]:
0.6089706667314586
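Since the simulated effect $\theta$ varies with mother's education and family structure, a quick histogram (an illustrative extra cell) shows the spread of the true effect around this mean:
[ ]:
# distribution of the true heterogeneous effect theta
plt.hist(theta, bins=50)
plt.xlabel('theta')
plt.ylabel('count')
plt.show()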
[38]:
# 2SLS estimate; with heterogeneous effects, 2SLS recovers a
# compliance-weighted average of theta, so it can differ slightly
# from the unweighted mean above
iv_fit = IVRegressor()
iv_fit.fit(X, treatment_sim, y_sim, w)
ate, ate_sd = iv_fit.predict()
(ate, ate_sd)
[38]:
(0.6611532131769402, 0.013922622951893662)
[51]:
# OLS estimate; biased upward because the unobserved nv drives both
# the treatment and the outcome
ols_fit = sm.OLS(y_sim, sm.add_constant(np.c_[treatment_sim, X], prepend=False)).fit()
(ols_fit.params[0], ols_fit.bse[0])
[51]:
(0.7501211540497275, 0.012800163754977008)
Pure Synthetic Data
The data generation mechanism is described in Hong et al. (2010), “Semiparametric Efficiency in Nonlinear LATE Models”.
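With a binary instrument $Z$ and a binary treatment $D$, the target estimand is the local average treatment effect (LATE) on compliers, i.e. the units whose treatment status responds to the instrument ($D(1) > D(0)$):

$$\text{LATE} = E\left[Y(1) - Y(0) \mid D(1) > D(0)\right],$$

which is exactly what the generator below returns as its last element.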
Data Generation
[54]:
def synthetic_data(n=10000, random_seed=None):
    np.random.seed(random_seed)
    gamma0 = -0.5
    gamma1 = 1.0
    delta = 1.0
    x = np.random.uniform(size=n)
    v = np.random.normal(size=n)
    # potential treatment status with (d1) and without (d0) the instrument
    d1 = (gamma0 + x * gamma1 + delta + v >= 0).astype(float)
    d0 = (gamma0 + x * gamma1 + v >= 0).astype(float)
    alpha = 1.0
    beta = 0.5
    lambda11 = 2.0
    lambda00 = 1.0
    xi1 = np.random.poisson(np.exp(alpha + x * beta))
    xi2 = np.random.poisson(np.exp(x * beta))
    xi3 = np.random.poisson(np.exp(lambda11), size=n)
    xi4 = np.random.poisson(np.exp(lambda00), size=n)
    # potential outcomes; always-takers (d1 == d0 == 1) get the extra xi3
    # component and never-takers (d1 == d0 == 0) the extra xi4 component
    y1 = xi1 + xi3 * ((d1 == 1) & (d0 == 1)) + xi4 * ((d1 == 0) & (d0 == 0))
    y0 = xi2 + xi3 * ((d1 == 1) & (d0 == 1)) + xi4 * ((d1 == 0) & (d0 == 0))
    # instrument assignment, realized treatment, and observed outcome
    z = np.random.binomial(1, stats.norm.cdf(x))
    d = d1 * z + d0 * (1 - z)
    y = y1 * d + y0 * (1 - d)
    # the last return value is the true LATE: the mean effect among compliers
    return y, x, d, z, y1[d1 > d0].mean() - y0[d1 > d0].mean()
[55]:
y, x, d, z, late = synthetic_data()
Estimation
[56]:
# True value
late
[56]:
2.1789099526066353
[57]:
# 2SLS estimate
iv_fit = IVRegressor()
iv_fit.fit(x, d, y, z)
ate, ate_sd = iv_fit.predict()
(ate, ate_sd)
[57]:
(2.1900472390231775, 0.2623695460540134)
[59]:
# OLS estimate; biased because always-takers, who are treated regardless
# of the instrument, also have systematically higher outcome levels (xi3)
ols_fit = sm.OLS(y, sm.add_constant(np.c_[d, x], prepend=False)).fit()
(ols_fit.params[0], ols_fit.bse[0])
[59]:
(5.3482879532439975, 0.09201397327077365)
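As a quick cross-check, the simple Wald ratio for a binary instrument should land in the same neighborhood as the 2SLS estimate, although it ignores the covariate x and so retains some bias (an illustrative snippet, not part of the IVRegressor API):
[ ]:
# Wald ratio: difference in mean outcomes divided by the difference in mean
# treatment take-up between the two instrument arms (covariate x is ignored)
wald = (y[z == 1].mean() - y[z == 0].mean()) / (d[z == 1].mean() - d[z == 0].mean())
wald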