Source code for causalml.metrics.regression

import logging
import numpy as np
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae  # noqa
from sklearn.metrics import r2_score  # noqa

from .const import EPS


logger = logging.getLogger("causalml")


[docs]def ape(y, p):
    """Absolute Percentage Error (APE).
    Args:
        y (float): target
        p (float): prediction

    Returns:
        e (float): APE
    """

    assert np.abs(y) > EPS
    return np.abs(1 - p / y)


[docs]def mape(y, p):
    """Mean Absolute Percentage Error (MAPE).
    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): MAPE
    """

    filt = np.abs(y) > EPS
    return np.mean(np.abs(1 - p[filt] / y[filt]))


[docs]def smape(y, p):
    """Symmetric Mean Absolute Percentage Error (sMAPE).
    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): sMAPE
    """
    return 2.0 * np.mean(np.abs(y - p) / (np.abs(y) + np.abs(p)))


[docs]def rmse(y, p):
    """Root Mean Squared Error (RMSE).
    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): RMSE
    """

    # check and get number of samples
    assert y.shape == p.shape

    return np.sqrt(mse(y, p))


[docs]def gini(y, p):
    """Normalized Gini Coefficient.

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        e (numpy.float64): normalized Gini coefficient
    """

    # check and get number of samples
    assert y.shape == p.shape

    n_samples = y.shape[0]

    # sort rows on prediction column
    # (from largest to smallest)
    arr = np.array([y, p]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    l_true = np.cumsum(true_order) / np.sum(true_order)
    l_pred = np.cumsum(pred_order) / np.sum(pred_order)
    l_ones = np.linspace(1 / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    g_true = np.sum(l_ones - l_true)
    g_pred = np.sum(l_ones - l_pred)

    # normalize to true Gini coefficient
    return g_pred / g_true


[docs]def regression_metrics(
    y, p, w=None, metrics={"RMSE": rmse, "sMAPE": smape, "Gini": gini}
):
    """Log metrics for regressors.

    Args:
        y (numpy.array): target
        p (numpy.array): prediction
        w (numpy.array, optional): a treatment vector (1 or True: treatment, 0 or False: control). If given, log
            metrics for the treatment and control group separately
        metrics (dict, optional): a dictionary of the metric names and functions
    """
    assert metrics
    assert y.shape[0] == p.shape[0]

    for name, func in metrics.items():
        if w is not None:
            assert y.shape[0] == w.shape[0]
            if w.dtype != bool:
                w = w == 1
            logger.info("{:>8s}   (Control): {:10.4f}".format(name, func(y[~w], p[~w])))
            logger.info("{:>8s} (Treatment): {:10.4f}".format(name, func(y[w], p[w])))
        else:
            logger.info("{:>8s}: {:10.4f}".format(name, func(y, p)))