Source code for getml.datasets.base

"""
Load preprocessed datasets
"""

import pandas as pd
import warnings
from urllib.request import urlopen, HTTPError
import json

from pathlib import Path
from getml.log import logger
from getml.data import DataFrame

# Base URL under which all datasets are hosted. `_load_dataset` extends it to
# "<BUCKET><ds_name>/preprocessed/<ds_name>_<asset>.csv" for the data files and
# to "..._roles.json" / "..._units.json" for the optional metadata.
BUCKET = "https://static.getml.com/datasets/"
"""S3 bucket containing the CSV files"""


def _fetch_json_metadata(url, description):
    """Download the JSON document at `url` and return the decoded object

    Args:
        url (str): location of the JSON file
        description (str): kind of metadata being fetched (e.g. "roles");
            only used to build the error message

    Returns:
        The object decoded from the JSON payload.

    Raises:
        Exception: If the server answers with an HTTP error.
    """
    try:
        # The context manager guarantees the HTTP response is closed, even
        # when reading or decoding fails.
        with urlopen(url) as response:
            return json.loads(response.read())
    except HTTPError as err:
        raise Exception(
            "Information on " + description + " could not be retrieved."
        ) from err


def _load_dataset(ds_name, assets, roles, units, as_pandas):
    """Helper function to load a dataset

    Every asset is downloaded as a CSV file and parsed with pandas. Unless
    `as_pandas` is set, the tables are then converted to
    :class:`~getml.data.DataFrame` s, optionally applying the roles and
    units stored next to the CSV files.

    Args:
        ds_name (str): name of the dataset
        assets (list): CSV files to be loaded from the S3 bucket
        roles (bool): Return getml.DataFrame with roles set
        units (bool): Return getml.DataFrame with units set
        as_pandas (bool): Return data as `pandas.DataFrame` s. If True,
            `roles` and `units` are ignored.

    Returns:
        dict:

            Dictionary containing the data as :class:`~getml.data.DataFrame` s or
            :class:`pandas.DataFrame` s (if `as_pandas` is True). The keys
            are the entries of `assets`.

    Raises:
        Exception: If roles or units were requested but the corresponding
            JSON file could not be retrieved.
    """
    # All files of a dataset share the prefix
    # "<BUCKET><ds_name>/preprocessed/<ds_name>_".
    prefix = BUCKET + ds_name + "/preprocessed" + "/" + ds_name + "_"

    df_pandas = {}
    for asset in assets:
        # Warnings emitted while parsing the CSV are deliberately silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            df_pandas[asset] = pd.read_csv(prefix + asset + ".csv")

    if as_pandas:
        return df_pandas

    # Metadata is only downloaded on request; `None` means "not requested".
    roles_by_table = (
        _fetch_json_metadata(prefix + "roles.json", "roles") if roles else None
    )
    units_by_table = (
        _fetch_json_metadata(prefix + "units.json", "units") if units else None
    )

    df_getml = {}
    for name, frame in df_pandas.items():
        table_roles = None if roles_by_table is None else roles_by_table.get(name)
        table_units = None if units_by_table is None else units_by_table.get(name)

        # Columns that will get the time_stamp role are parsed into
        # pd.Timestamps before the conversion.
        if table_roles is not None and "time_stamp" in table_roles:
            for column in table_roles["time_stamp"]:
                frame[column] = pd.to_datetime(frame[column])

        df_getml[name] = DataFrame.from_pandas(frame, name=name, roles=table_roles)

        # The units file maps each unit to the columns carrying it.
        if table_units is not None:
            for unit, columns in table_units.items():
                df_getml[name].set_unit(columns, unit)

    return df_getml


def load_air_pollution(roles=False, as_pandas=False):
    """Regression dataset on air pollution in Beijing, China

    The dataset consists of a single table split into train and test sets
    around 2014-01-01.

    The original publication is:
    Liang, X., Zou, T., Guo, B., Li, S., Zhang, H., Zhang, S., Huang, H. and
    Chen, S. X. (2015). Assessing Beijing's PM2.5 pollution: severity, weather
    impact, APEC and winter heating. Proceedings of the Royal Society A, 471,
    20150257.

    Args:
        roles (bool):

            Return data with roles set

        as_pandas (bool):

            Return data as `pandas.DataFrame` s. In this case `roles` is
            ignored.

    Returns:
        dict:

            Dictionary containing the data as :class:`~getml.data.DataFrame` s or
            :class:`pandas.DataFrame` s (if `as_pandas` is True). The keys correspond
            to the name of the DataFrame on the :mod:`~getml.engine`. The following
            DataFrames are contained in the dictionary

            * train
            * test

    Examples:

        >>> df_getml = getml.datasets.load_air_pollution()
        >>> type(df_getml["test"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the air pollution dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples>`_.


    Note:

        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.data.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no units.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    # This dataset ships without a units file, so units are never requested.
    return _load_dataset(
        ds_name="air_pollution",
        assets=["train", "test"],
        roles=roles,
        units=False,
        as_pandas=as_pandas,
    )


def load_atherosclerosis(roles=False, as_pandas=False):
    """Binary classification dataset on the lethality of atherosclerosis

    The atherosclerosis dataset is a medical dataset from the `CTU Prague
    Relational Learning Repository
    <https://relational.fit.cvut.cz/dataset/Atherosclerosis>`_. It contains
    information from a longitudinal study on 1417 middle-aged men observed over
    the course of 20 years. After preprocessing, it consists of 2 tables with 76
    and 66 columns:

    * `population`: Data on the study's participants

    * `contr`: Data on control dates

    The population table is split into a training (70%), a testing (15%) and a
    validation (15%) set.

    Args:
        roles (bool):

            Return data with roles set

        as_pandas (bool):

            Return data as `pandas.DataFrame` s. In this case `roles` is
            ignored.

    Returns:
        dict:

            Dictionary containing the data as :class:`~getml.data.DataFrame` s or
            :class:`pandas.DataFrame` s (if `as_pandas` is True). The keys correspond
            to the name of the DataFrame on the :mod:`~getml.engine`. The following
            DataFrames are contained in the dictionary

            * population_train
            * population_test
            * population_validation
            * contr

    Examples:

        >>> df_getml = getml.datasets.load_atherosclerosis()
        >>> type(df_getml["population_train"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the atherosclerosis dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/atherosclerosis>`_.


    Note:

        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.data.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no units.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    # This dataset ships without a units file, so units are never requested.
    return _load_dataset(
        ds_name="atherosclerosis",
        assets=[
            "population_train",
            "population_test",
            "population_validation",
            "contr",
        ],
        roles=roles,
        units=False,
        as_pandas=as_pandas,
    )


def load_biodegradability(roles=False, as_pandas=False):
    """Regression dataset on molecule weight prediction

    The QSAR biodegradation dataset was built in the Milano Chemometrics and
    QSAR Research Group (Universita degli Studi Milano-Bicocca, Milano, Italy).
    The data have been used to develop QSAR (Quantitative Structure Activity
    Relationships) models for the study of the relationships between chemical
    structure and biodegradation of molecules. Biodegradation experimental
    values of 1055 chemicals were collected from the webpage of the National
    Institute of Technology and Evaluation of Japan (NITE).

    The original publication is:
    Mansouri, K., Ringsted, T., Ballabio, D., Todeschini, R., Consonni, V.
    (2013). Quantitative Structure - Activity Relationship models for ready
    biodegradability of chemicals. Journal of Chemical Information and Modeling,
    53, 867-878

    The dataset was collected through the `UCI Machine Learning Repository
    <https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation>`_.

    It contains information on 1309 molecules with 6166 bonds. It consists of 5
    tables.

    The population table is split into a training (50%), a testing (25%) and a
    validation (25%) set.

    Args:
        roles (bool):

            Return data with roles set

        as_pandas (bool):

            Return data as `pandas.DataFrame` s. In this case `roles` is
            ignored.

    Returns:
        dict:

            Dictionary containing the data as :class:`~getml.data.DataFrame` s or
            :class:`pandas.DataFrame` s (if `as_pandas` is True). The keys correspond
            to the name of the DataFrame on the :mod:`~getml.engine`. The following
            DataFrames are contained in the dictionary

            * molecule_train
            * molecule_test
            * molecule_validation
            * atom
            * bond
            * gmember
            * group

    Examples:

        >>> df_getml = getml.datasets.load_biodegradability()
        >>> type(df_getml["molecule_train"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the biodegradability dataset including all necessary
        preprocessing steps please refer to getml-examples (forthcoming).

    Note:

        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.data.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no units.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    # This dataset ships without a units file, so units are never requested.
    return _load_dataset(
        ds_name="biodegradability",
        assets=[
            "molecule_train",
            "molecule_test",
            "molecule_validation",
            "atom",
            "bond",
            "gmember",
            "group",
        ],
        roles=roles,
        units=False,
        as_pandas=as_pandas,
    )


def load_consumer_expenditures(roles=False, units=False, as_pandas=False):
    """Binary classification dataset on consumer expenditures

    The Consumer Expenditure Data Set is a public domain data set provided by
    the American Bureau of Labor Statistics (https://www.bls.gov/cex/pumd.htm).
    It includes the diary entries, where American consumers are asked to keep
    diaries of the products they have purchased each month.

    We use this dataset to classify whether an item was purchased as a gift.

    Args:
        roles (bool):

            Return data with roles set

        units (bool):

            Return data with units set

        as_pandas (bool):

            Return data as `pandas.DataFrame` s. In this case `roles` and
            `units` are ignored.

    Returns:
        dict:

            Dictionary containing the data as :class:`~getml.data.DataFrame` s or
            :class:`pandas.DataFrame` s (if `as_pandas` is True). The keys correspond
            to the name of the DataFrame on the :mod:`~getml.engine`. The following
            DataFrames are contained in the dictionary

            * population_testing
            * population_training
            * population_validation
            * expd
            * fmld
            * memd

    Examples:

        >>> df_getml = getml.datasets.load_consumer_expenditures()
        >>> type(df_getml["expd"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the consumer expenditures dataset including all
        necessary preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/consumer_expenditures>`_.


    Note:

        Roles and units can be set ad-hoc by supplying the respective flags. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.data.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    return _load_dataset(
        ds_name="consumer_expenditures",
        assets=[
            "population_testing",
            "population_training",
            "population_validation",
            "expd",
            "fmld",
            "memd",
        ],
        roles=roles,
        units=units,
        as_pandas=as_pandas,
    )


def load_interstate94(roles=False, units=False, as_pandas=False):
    """Regression dataset on traffic volume prediction

    The interstate94 dataset is a multivariate time series containing the
    hourly traffic volume on I-94 westbound from Minneapolis-St Paul. It is
    based on data provided by the `MN Department of Transportation
    <https://www.dot.state.mn.us/>`_. Some additional data preparation was
    done by `John Hogue <https://github.com/dreyco676/Anomaly_Detection_A_to_Z/>`_.
    The dataset features some particularly interesting characteristics common
    for time series, which classical models may struggle to appropriately deal
    with. Such characteristics are:

    * High frequency (hourly)
    * Dependence on irregular events (holidays)
    * Strong and overlapping cycles (daily, weekly)
    * Anomalies
    * Multiple seasonalities

    Args:
        roles (bool):

            Return data with roles set

        units (bool):

            Return data with units set

        as_pandas (bool):

            Return data as `pandas.DataFrame` s. In this case `roles` and
            `units` are ignored.

    Returns:
        dict:

            Dictionary containing the data as :class:`~getml.data.DataFrame` s or
            :class:`pandas.DataFrame` s (if `as_pandas` is True). The keys correspond
            to the name of the DataFrame on the :mod:`~getml.engine`. The following
            DataFrames are contained in the dictionary

            * traffic_train
            * traffic_test
            * weather

    Examples:

        >>> df_getml = getml.datasets.load_interstate94()
        >>> type(df_getml["traffic_train"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the interstate94 dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/interstate94>`_.

    Note:

        Roles and units can be set ad-hoc by supplying the respective flags. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.data.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. Before using them in an
        analysis, a data model needs to be constructed using
        :class:`~getml.data.Placeholder` s.
    """

    return _load_dataset(
        ds_name="interstate94",
        assets=["traffic_test", "traffic_train", "weather"],
        roles=roles,
        units=units,
        as_pandas=as_pandas,
    )


def load_loans(roles=False, units=False, as_pandas=False):
    """Binary classification dataset on loan default

    The loans dataset is based on the financial dataset from the `CTU Prague
    Relational Learning Repository
    <https://relational.fit.cvut.cz/dataset/Financial>`_.

    The original publication is:
    Berka, Petr (1999). Workshop notes on Discovery Challenge PKDD'99.

    The dataset contains information on 606 successful and 76 unsuccessful
    loans. After some preprocessing it contains 4 tables

    * `population`: Information about the loans themselves, such as the date
      of creation, the amount, and the planned duration of the loan. The
      target variable is the status of the loan (default/no default)

    * `order`: Information about permanent orders, debited payments and
      account balances.

    * `trans`: Information about transactions and account balances.

    * `meta`: Meta information about the obligor, such as gender and
      geo-information

    The population table is split into a training and a testing set at 80% of
    the main population.

    Args:
        roles (bool):

            Return data with roles set

        units (bool):

            Return data with units set

        as_pandas (bool):

            Return data as `pandas.DataFrame` s. In this case `roles` and
            `units` are ignored.

    Returns:
        dict:

            Dictionary containing the data as :class:`~getml.data.DataFrame` s or
            :class:`pandas.DataFrame` s (if `as_pandas` is True). The keys correspond
            to the name of the DataFrame on the :mod:`~getml.engine`. The following
            DataFrames are contained in the dictionary

            * population_train
            * population_test
            * order
            * trans
            * meta

    Examples:

        >>> df_getml = getml.datasets.load_loans()
        >>> type(df_getml["population_train"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the loans dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/loans>`_.

    Note:

        Roles and units can be set ad-hoc by supplying the respective flags. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.data.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. Before using them in an
        analysis, a data model needs to be constructed using
        :class:`~getml.data.Placeholder` s.
    """

    return _load_dataset(
        ds_name="loans",
        assets=["population_train", "population_test", "order", "trans", "meta"],
        roles=roles,
        units=units,
        as_pandas=as_pandas,
    )


def load_occupancy(roles=False, as_pandas=False):
    """Binary classification dataset on occupancy detection

    The occupancy detection data set is a very simple multivariate time series
    from the `UCI Machine Learning Repository
    <https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+>`_. It is a
    binary classification problem. The task is to predict room occupancy
    from Temperature, Humidity, Light and CO2.

    The original publication is:
    Candanedo, L. M., & Feldheim, V. (2016). Accurate occupancy detection of an
    office room from light, temperature, humidity and CO2 measurements using
    statistical learning models. Energy and Buildings, 112, 28-39.

    Args:
        roles (bool):

            Return data with roles set

        as_pandas (bool):

            Return data as `pandas.DataFrame` s. In this case `roles` is
            ignored.

    Returns:
        dict:

            Dictionary containing the data as :class:`~getml.data.DataFrame` s or
            :class:`pandas.DataFrame` s (if `as_pandas` is True). The keys correspond
            to the name of the DataFrame on the :mod:`~getml.engine`. The following
            DataFrames are contained in the dictionary

            * train
            * validate
            * test

    Examples:

        >>> df_getml = getml.datasets.load_occupancy()
        >>> type(df_getml["train"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the occupancy dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/occupancy>`_.


    Note:

        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.data.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no units.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    # This dataset ships without a units file, so units are never requested.
    return _load_dataset(
        ds_name="occupancy",
        assets=["test", "train", "validate"],
        roles=roles,
        units=False,
        as_pandas=as_pandas,
    )