# Source code for causalml.dataset.classification

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification


def make_uplift_classification(n_samples=1000,
                               treatment_name=None,
                               y_name='conversion',
                               n_classification_features=10,
                               n_classification_informative=5,
                               n_classification_redundant=0,
                               n_classification_repeated=0,
                               n_uplift_increase_dict=None,
                               n_uplift_decrease_dict=None,
                               delta_uplift_increase_dict=None,
                               delta_uplift_decrease_dict=None,
                               n_uplift_increase_mix_informative_dict=None,
                               n_uplift_decrease_mix_informative_dict=None,
                               positive_class_proportion=0.5,
                               random_seed=20190101):
    """Generate a synthetic dataset for a classification uplift modeling problem.

    Parameters
    ----------
    n_samples : int, optional (default=1000)
        The number of samples to be generated for each treatment group.
    treatment_name : list, optional (default = ['control','treatment1','treatment2','treatment3'])
        The list of treatment names.
    y_name : string, optional (default = 'conversion')
        The name of the outcome variable to be used as a column in the output dataframe.
    n_classification_features : int, optional (default = 10)
        Total number of features for base classification.
    n_classification_informative : int, optional (default = 5)
        Total number of informative features for base classification.
    n_classification_redundant : int, optional (default = 0)
        Total number of redundant features for base classification.
    n_classification_repeated : int, optional (default = 0)
        Total number of repeated features for base classification.
    n_uplift_increase_dict : dictionary, optional (default: {'treatment1': 2, 'treatment2': 2, 'treatment3': 2})
        Number of features for generating positive treatment effects for the corresponding
        treatment group. Dictionary of {treatment_key: number_of_features_for_increase_uplift}.
    n_uplift_decrease_dict : dictionary, optional (default: {'treatment1': 0, 'treatment2': 0, 'treatment3': 0})
        Number of features for generating negative treatment effects for the corresponding
        treatment group. Dictionary of {treatment_key: number_of_features_for_decrease_uplift}.
    delta_uplift_increase_dict : dictionary, optional (default: {'treatment1': .02, 'treatment2': .05, 'treatment3': .1})
        Positive treatment effect created by the positive uplift features on the base
        classification label. Dictionary of {treatment_key: increase_delta}.
    delta_uplift_decrease_dict : dictionary, optional (default: {'treatment1': 0., 'treatment2': 0., 'treatment3': 0.})
        Negative treatment effect created by the negative uplift features on the base
        classification label. Dictionary of {treatment_key: decrease_delta}.
    n_uplift_increase_mix_informative_dict : dictionary, optional (default: {'treatment1': 1, 'treatment2': 1, 'treatment3': 1})
        Number of positive mix features for each treatment. A positive mix feature is a
        linear combination of a randomly selected informative classification feature and a
        randomly selected positive uplift feature, with both coefficients sampled from a
        uniform distribution between -1 and 1.
    n_uplift_decrease_mix_informative_dict : dictionary, optional (default: {'treatment1': 0, 'treatment2': 0, 'treatment3': 0})
        Number of negative mix features for each treatment. A negative mix feature is a
        linear combination of a randomly selected informative classification feature and a
        randomly selected negative uplift feature, with both coefficients sampled from a
        uniform distribution between -1 and 1.
    positive_class_proportion : float, optional (default = 0.5)
        The proportion of the positive label (1) in the control group.
    random_seed : int, optional (default = 20190101)
        The random seed to be used in the data generation process.

    Returns
    -------
    df_res : DataFrame
        A data frame containing the treatment label, features, and outcome variable.
    x_name : list
        The list of feature names generated.

    Notes
    -----
    The algorithm for generating the base classification dataset is adapted from the
    make_classification method in the sklearn package, which uses the algorithm in
    Guyon [1] designed to generate the "Madelon" dataset.

    References
    ----------
    .. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable
           selection benchmark", 2003.
    """
    # Mutable containers must not be used as default argument values (they would be
    # shared across calls); resolve the documented defaults here instead.
    if treatment_name is None:
        treatment_name = ['control', 'treatment1', 'treatment2', 'treatment3']
    if n_uplift_increase_dict is None:
        n_uplift_increase_dict = {'treatment1': 2, 'treatment2': 2, 'treatment3': 2}
    if n_uplift_decrease_dict is None:
        n_uplift_decrease_dict = {'treatment1': 0, 'treatment2': 0, 'treatment3': 0}
    if delta_uplift_increase_dict is None:
        delta_uplift_increase_dict = {'treatment1': 0.02, 'treatment2': 0.05, 'treatment3': 0.1}
    if delta_uplift_decrease_dict is None:
        delta_uplift_decrease_dict = {'treatment1': 0., 'treatment2': 0., 'treatment3': 0.}
    if n_uplift_increase_mix_informative_dict is None:
        n_uplift_increase_mix_informative_dict = {'treatment1': 1, 'treatment2': 1, 'treatment3': 1}
    if n_uplift_decrease_mix_informative_dict is None:
        n_uplift_decrease_mix_informative_dict = {'treatment1': 0, 'treatment2': 0, 'treatment3': 0}

    # Set the global NumPy seed; sklearn's make_classification below draws from the
    # same global state, so the whole generation process is reproducible per seed.
    np.random.seed(seed=random_seed)

    # Create the output data frame.
    df_res = pd.DataFrame()

    # Generate the treatment assignment: n_samples rows per group, randomly shuffled.
    n_all = n_samples * len(treatment_name)
    treatment_list = []
    for ti in treatment_name:
        treatment_list += [ti] * n_samples
    treatment_list = np.random.permutation(treatment_list)
    df_res['treatment_group_key'] = treatment_list

    # Generate the base classification features and labels for all samples.
    X1, Y1 = make_classification(n_samples=n_all,
                                 n_features=n_classification_features,
                                 n_informative=n_classification_informative,
                                 n_redundant=n_classification_redundant,
                                 n_repeated=n_classification_repeated,
                                 n_clusters_per_class=1,
                                 weights=[1 - positive_class_proportion,
                                          positive_class_proportion])

    # Name and store the feature columns. make_classification lays them out as
    # [informative | redundant | repeated | ...]; any remaining slots are replaced
    # by freshly drawn irrelevant (pure-noise) columns.
    x_name = []
    x_informative_name = []
    for xi in range(n_classification_informative):
        x_name_i = 'x' + str(len(x_name) + 1) + '_informative'
        x_name.append(x_name_i)
        x_informative_name.append(x_name_i)
        df_res[x_name_i] = X1[:, xi]
    for xi in range(n_classification_redundant):
        x_name_i = 'x' + str(len(x_name) + 1) + '_redundant'
        x_name.append(x_name_i)
        df_res[x_name_i] = X1[:, n_classification_informative + xi]
    for xi in range(n_classification_repeated):
        x_name_i = 'x' + str(len(x_name) + 1) + '_repeated'
        x_name.append(x_name_i)
        df_res[x_name_i] = X1[:, n_classification_informative + n_classification_redundant + xi]
    for xi in range(n_classification_features - n_classification_informative
                    - n_classification_redundant - n_classification_repeated):
        x_name_i = 'x' + str(len(x_name) + 1) + '_irrelevant'
        x_name.append(x_name_i)
        df_res[x_name_i] = np.random.normal(0, 1, n_all)

    # Default treatment effects (no uplift until added below).
    Y = Y1.copy()
    Y_increase = np.zeros_like(Y1)
    Y_decrease = np.zeros_like(Y1)

    # Generate positive uplift. NOTE: the uplift feature columns are generated for ALL
    # rows, but the label adjustment Y_increase is applied only to this treatment's rows.
    for treatment_key_i in treatment_name:
        treatment_index = df_res.index[df_res['treatment_group_key'] == treatment_key_i].tolist()
        if treatment_key_i in n_uplift_increase_dict and n_uplift_increase_dict[treatment_key_i] > 0:
            x_uplift_increase_name = []
            # Rescale the requested delta so the expected lift among the (1 - p)
            # non-positive base labels matches delta_uplift_increase_dict.
            adjust_class_proportion = (delta_uplift_increase_dict[treatment_key_i]) / (1 - positive_class_proportion)
            X_increase, Y_increase = make_classification(n_samples=n_all,
                                                         n_features=n_uplift_increase_dict[treatment_key_i],
                                                         n_informative=n_uplift_increase_dict[treatment_key_i],
                                                         n_redundant=0,
                                                         n_clusters_per_class=1,
                                                         weights=[1 - adjust_class_proportion,
                                                                  adjust_class_proportion])
            for xi in range(n_uplift_increase_dict[treatment_key_i]):
                x_name_i = 'x' + str(len(x_name) + 1) + '_uplift_increase'
                x_name.append(x_name_i)
                x_uplift_increase_name.append(x_name_i)
                df_res[x_name_i] = X_increase[:, xi]
            Y[treatment_index] = Y[treatment_index] + Y_increase[treatment_index]
            if n_uplift_increase_mix_informative_dict[treatment_key_i] > 0:
                for xi in range(n_uplift_increase_mix_informative_dict[treatment_key_i]):
                    x_name_i = 'x' + str(len(x_name) + 1) + '_increase_mix'
                    x_name.append(x_name_i)
                    df_res[x_name_i] = (np.random.uniform(-1, 1) * df_res[np.random.choice(x_informative_name)]
                                        + np.random.uniform(-1, 1) * df_res[np.random.choice(x_uplift_increase_name)])

    # Generate negative uplift, mirroring the positive case but subtracting the effect.
    for treatment_key_i in treatment_name:
        treatment_index = df_res.index[df_res['treatment_group_key'] == treatment_key_i].tolist()
        if treatment_key_i in n_uplift_decrease_dict and n_uplift_decrease_dict[treatment_key_i] > 0:
            x_uplift_decrease_name = []
            adjust_class_proportion = (delta_uplift_decrease_dict[treatment_key_i]) / (1 - positive_class_proportion)
            X_decrease, Y_decrease = make_classification(n_samples=n_all,
                                                         n_features=n_uplift_decrease_dict[treatment_key_i],
                                                         n_informative=n_uplift_decrease_dict[treatment_key_i],
                                                         n_redundant=0,
                                                         n_clusters_per_class=1,
                                                         weights=[1 - adjust_class_proportion,
                                                                  adjust_class_proportion])
            for xi in range(n_uplift_decrease_dict[treatment_key_i]):
                x_name_i = 'x' + str(len(x_name) + 1) + '_uplift_decrease'
                x_name.append(x_name_i)
                x_uplift_decrease_name.append(x_name_i)
                df_res[x_name_i] = X_decrease[:, xi]
            Y[treatment_index] = Y[treatment_index] - Y_decrease[treatment_index]
            if n_uplift_decrease_mix_informative_dict[treatment_key_i] > 0:
                for xi in range(n_uplift_decrease_mix_informative_dict[treatment_key_i]):
                    x_name_i = 'x' + str(len(x_name) + 1) + '_decrease_mix'
                    x_name.append(x_name_i)
                    df_res[x_name_i] = (np.random.uniform(-1, 1) * df_res[np.random.choice(x_informative_name)]
                                        + np.random.uniform(-1, 1) * df_res[np.random.choice(x_uplift_decrease_name)])

    # Clip labels back into {0, 1} after the additive/subtractive adjustments.
    Y = np.clip(Y, 0, 1)

    df_res[y_name] = Y
    # Per-row realized treatment effect: difference between the adjusted and base labels.
    df_res['treatment_effect'] = Y - Y1
    return df_res, x_name