# Source code for ananke.datasets.helpers

"""
Helper functions for loading datasets in Ananke.
"""

import os

import pandas as pd

# Absolute directory containing this module; the dataset loaders below resolve
# their CSV paths relative to it. `abspath` keeps the value valid even when
# `__file__` is relative and the working directory changes later.
MODULE_PATH = os.path.dirname(os.path.abspath(__file__))


def load_conditionally_ignorable_data() -> pd.DataFrame:
    """
    Load toy data for the conditionally ignorable model
    where the confounder is Viral Load, T is the treatment
    and the outcome is CD4 counts.

    The data is read from ``simulated/conditionally_ignorable.csv``,
    located relative to ``MODULE_PATH``.

    :return: pandas dataframe.
    """

    # Pass the components separately so os.path.join picks the separator.
    path = os.path.join(MODULE_PATH, "simulated", "conditionally_ignorable.csv")
    return pd.read_csv(path)


def load_afixable_data() -> pd.DataFrame:
    """
    Load toy data for an adjustment fixable setting
    where T is the treatment and the outcome is CD4 counts.

    The data is read from ``simulated/a_fixable.csv``,
    located relative to ``MODULE_PATH``.

    :return: pandas dataframe.
    """

    # Pass the components separately so os.path.join picks the separator.
    path = os.path.join(MODULE_PATH, "simulated", "a_fixable.csv")
    return pd.read_csv(path)


def load_frontdoor_data() -> pd.DataFrame:
    """
    Load toy data for frontdoor setting
    where T is the treatment and the outcome is CD4 counts.

    The data is read from ``simulated/frontdoor.csv``,
    located relative to ``MODULE_PATH``.

    :return: pandas dataframe.
    """

    # Pass the components separately so os.path.join picks the separator.
    path = os.path.join(MODULE_PATH, "simulated", "frontdoor.csv")
    return pd.read_csv(path)


def load_wisconsin_health_study() -> pd.DataFrame:
    # Raw docstring: the text contains ``\$``, which is an invalid escape
    # sequence in a regular string literal and triggers a warning on import.
    r"""
    Load the dataset extract from the Wisconsin Health Study presented in [1].
    Columns are defined as follows:
    X: an indicator of whether family income in 1957 was above \$5k;
    Y: an indicator of whether the respondent's income in 1992 was above \$37k;
    M: an indicator of whether the respondent was drafted into the military;
    E: an indicator of whether the respondent had education beyond high school.
    count: the count of each event in (X, Y, M, E)

    The data is read from ``real/evans_richardson_wisconsin.csv``,
    located relative to ``MODULE_PATH``.

    [1] R. J. Evans and T. S. Richardson, “Smooth, identifiable supermodels
    of discrete DAG models with latent variables,” Bernoulli, vol. 25, no. 2,
    pp. 848–876, May 2019, doi: 10.3150/17-BEJ1005.

    :return: pandas dataframe.
    """
    # Pass the components separately so os.path.join picks the separator.
    path = os.path.join(MODULE_PATH, "real", "evans_richardson_wisconsin.csv")
    return pd.read_csv(path)