# Source code for getml.datasets.base

# Copyright 2022 The SQLNet Company GmbH
#
# This file is licensed under the Elastic License 2.0 (ELv2).
# Refer to the LICENSE.txt file in the root of the repository
# for details.
#

"""
Load preprocessed datasets
"""

import json
import warnings
from typing import Dict, List, Optional, Tuple, Union
from urllib.request import urlopen  # type: ignore

import pandas as pd  # type: ignore

from getml.data import DataFrame

# Type alias for a single loaded table: either a getml DataFrame or its
# pandas counterpart (the loaders return the latter when `as_pandas` is set).
DataFrameT = Union[DataFrame, pd.DataFrame]

# Path segment that follows BUCKET when `_load_dataset` builds download URLs.
VERSION: str = "v1"

BUCKET: str = "https://static.getml.com/datasets"
"""S3 bucket containing the CSV files"""

# Maps a dataset name to the names of its assets. `_load_dataset` downloads
# each asset from
#     <BUCKET>/<VERSION>/<dataset>/preprocessed/<dataset>_<asset>.csv
# and uses the asset name as the name of the resulting DataFrame.
# The order of each list matters: it is the order of the tuple handed back
# by the loaders, so reordering entries changes what callers unpack.
# NOTE(review): "loans_new" has no loader function in the visible part of
# this module - confirm whether it is used elsewhere.
ASSETS: Dict[str, List[str]] = {
    "air_pollution": ["population"],
    "atherosclerosis": ["population", "contr"],
    "biodegradability": [
        "molecule_train",
        "molecule_test",
        "molecule_validation",
        "atom",
        "bond",
        "gmember",
        "group",
    ],
    "consumer_expenditures": [
        "population_training",
        "population_testing",
        "population_validation",
        "expd",
        "fmld",
        "memd",
    ],
    "interstate94": ["traffic"],
    "loans": [
        "population_train",
        "population_test",
        "order",
        "trans",
        "meta",
    ],
    "loans_new": [
        "account",
        "card",
        "client",
        "disp",
        "district",
        "loan",
        "order",
        "trans",
    ],
    "occupancy": ["population_train", "population_test", "population_validation"],
}


def _load_dataset(
    ds_name: str,
    assets: Optional[List[str]] = None,
    roles: bool = False,
    units: bool = False,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """Helper function to load a dataset

    Every asset is read from
    ``<BUCKET>/<VERSION>/<ds_name>/preprocessed/<ds_name>_<asset>.csv`` into a
    :class:`~getml.DataFrame` named after the asset. A progress message is
    printed for each asset.

    Args:
        ds_name (str):
            name of the dataset

        assets (list):
            CSV files to be loaded from the S3 bucket. Defaults to
            ``ASSETS[ds_name]``.

        roles (bool):
            Return getml.DataFrame with roles set. The roles are downloaded
            from ``<ds_name>_roles.json``.

        units (bool):
            Return getml.DataFrame with units set. The units are downloaded
            from ``<ds_name>_units.json``.

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing the data as :class:`~getml.DataFrame` s or
            :class:`pandas.DataFrame` s (if `as_pandas` is True), in the same
            order as `assets`, or

        dict:
            if `as_dict` is `True`: dict with the asset names as keys and the
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True) as values.

    Raises:
        KeyError:
            If `assets` is None and `ds_name` is not a key of ``ASSETS``.
    """
    base = f"{BUCKET}/{VERSION}/{ds_name}/preprocessed"

    if assets is None:
        assets = ASSETS[ds_name]

    # Roles are keyed by asset name; assets without an entry get `roles=None`.
    roles_ = _fetch_json(f"{base}/{ds_name}_roles.json") if roles else {}

    dfs = {}
    for asset in assets:
        filename = f"{base}/{ds_name}_{asset}.csv"
        # Warnings raised while reading the CSV are deliberately silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            print()
            print(f"Loading {asset}...")
            dfs[asset] = DataFrame.from_csv(
                name=asset,
                fnames=filename,
                roles=roles_.get(asset),
                verbose=False,
            )

    if units:
        # Layout of the units file: {asset name: {unit: columns}}.
        units_ = _fetch_json(f"{base}/{ds_name}_units.json")

        for key, df in dfs.items():
            for unit, columns in units_.get(key, {}).items():
                # An empty unit string means "no unit" and is skipped.
                if unit:
                    df.set_unit(columns, unit)

    if as_pandas:
        dfs = {key: df.to_pandas() for key, df in dfs.items()}

    if as_dict:
        return dfs

    # dicts preserve insertion order, so this follows the order of `assets`.
    return tuple(dfs.values())


def _fetch_json(url: str) -> dict:
    """Download `url` and return its decoded JSON content.

    The HTTP response is used as a context manager so that the underlying
    connection is closed as soon as the body has been read.
    The payload is expected to be a JSON object (callers use ``.get`` on it).
    """
    with urlopen(url) as response:
        return json.loads(response.read())


def load_air_pollution(
    roles: bool = True,
    as_pandas: bool = False,
) -> DataFrameT:
    """
    Regression dataset on air pollution in Beijing, China

    The dataset consists of a single table split into train and test sets
    around 2014-01-01.

    The original publication is:
    Liang, X., Zou, T., Guo, B., Li, S., Zhang, H., Zhang, S., Huang, H. and
    Chen, S. X. (2015). Assessing Beijing's PM2.5 pollution: severity, weather
    impact, APEC and winter heating. Proceedings of the Royal Society A, 471,
    20150257.

    Args:
        roles (bool):
            Return data with roles set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

    Returns:
        getml.data.DataFrame:
            A DataFrame holding the data described above (a
            :class:`pandas.DataFrame` if `as_pandas` is True).

        The following DataFrames are returned:

            * population

    Examples:
        >>> air_pollution = getml.datasets.load_air_pollution()
        >>> type(air_pollution)
        ... getml.data.data_frame.DataFrame

        For a full analysis of the air pollution dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples>`_.


    Note:
        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no units.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    dataset = _load_dataset(
        ds_name="air_pollution",
        roles=roles,
        as_pandas=as_pandas,
    )
    # `as_dict` is not passed, so a tuple comes back; the assert narrows the
    # Union return type for static type checkers.
    assert isinstance(dataset, tuple), "Expected a tuple"
    # The dataset has a single asset, hence a one-element tuple.
    return dataset[0]


def load_atherosclerosis(
    roles: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Binary classification dataset on the lethality of atherosclerosis

    The atherosclerosis dataset is a medical dataset from the `CTU Prague
    Relational Learning Repository
    <https://relational.fit.cvut.cz/dataset/Atherosclerosis>`_. It contains
    information from a longitudinal study on 1417 middle-aged men observed over
    the course of 20 years. After preprocessing, it consists of 2 tables with 76
    and 66 columns:

    * `population`: Data on the study's participants

    * `contr`: Data on control dates

    The population table is split into a training (70%), a testing (15%) set and a
    validation (15%) set.

    Args:
        roles (bool):
            Return data with roles set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing (in the order listed below) the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True) or
        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True). The keys correspond to the name of the DataFrame on the
            :mod:`~getml.engine`.

        The following DataFrames are returned:

            * population
            * contr

    Examples:
        >>> population, contr = getml.datasets.load_atherosclerosis()
        >>> type(population)
        ... getml.data.data_frame.DataFrame

        For a full analysis of the atherosclerosis dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/atherosclerosis>`_.


    Note:
        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no units.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    return _load_dataset(
        ds_name="atherosclerosis",
        roles=roles,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )


def load_biodegradability(
    roles: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Regression dataset on molecule weight prediction

    The QSAR biodegradation dataset was built in the Milano Chemometrics and
    QSAR Research Group (Universita degli Studi Milano-Bicocca, Milano, Italy).
    The data have been used to develop QSAR (Quantitative Structure Activity
    Relationships) models for the study of the relationships between chemical
    structure and biodegradation of molecules. Biodegradation experimental
    values of 1055 chemicals were collected from the webpage of the National
    Institute of Technology and Evaluation of Japan (NITE).

    The original publication is:
    Mansouri, K., Ringsted, T., Ballabio, D., Todeschini, R., Consonni, V.
    (2013). Quantitative Structure - Activity Relationship models for ready
    biodegradability of chemicals. Journal of Chemical Information and Modeling,
    53, 867-878

    The dataset was collected through the `UCI Machine Learning Repository
    <https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation>`_.

    It contains information on 1309 molecules with 6166 bonds. It consists of 5
    tables. The population table (`molecule`) is delivered as three separate
    DataFrames, one per split.

    The population table is split into a training (50%), a testing (25%) and
    a validation (25%) set.

    Args:
        roles (bool):
            Return data with roles set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing (in the order listed below) the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True) or
        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True). The keys correspond to the name of the DataFrame on the
            :mod:`~getml.engine`.

        The following DataFrames are returned:

            * molecule_train
            * molecule_test
            * molecule_validation
            * atom
            * bond
            * gmember
            * group

    Examples:
        >>> biodegradability = getml.datasets.load_biodegradability(as_dict=True)
        >>> type(biodegradability["molecule_train"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the biodegradability dataset including all necessary
        preprocessing steps please refer to getml-examples (forthcoming).

    Note:
        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no units.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    return _load_dataset(
        ds_name="biodegradability",
        roles=roles,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )


def load_consumer_expenditures(
    roles: bool = True,
    units: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Binary classification dataset on consumer expenditures

    The Consumer Expenditure Data Set is a public domain data set provided by
    the American Bureau of Labor Statistics (https://www.bls.gov/cex/pumd.htm).
    It includes the diary entries, where American consumers are asked to keep
    diaries of the products they have purchased each month.

    We use this dataset to classify whether an item was purchased as a gift.

    Args:
        roles (bool):
            Return data with roles set

        units (bool):
            Return data with units set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing (in the order listed below) the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True) or
        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True). The keys correspond to the name of the DataFrame on the
            :mod:`~getml.engine`.

        The following DataFrames are returned:

            * population_training
            * population_testing
            * population_validation
            * expd
            * fmld
            * memd

    Examples:
        >>> ce = getml.datasets.load_consumer_expenditures(as_dict=True)
        >>> type(ce["expd"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the consumer expenditures dataset including all
        necessary preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/consumer_expenditures>`_.

    Note:
        Roles and units can be set ad-hoc by supplying the respective flags. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    return _load_dataset(
        ds_name="consumer_expenditures",
        roles=roles,
        units=units,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )


def load_interstate94(
    roles: bool = True,
    units: bool = True,
    as_pandas: bool = False,
) -> DataFrameT:
    """
    Regression dataset on traffic volume prediction

    The interstate94 dataset is a multivariate time series containing the
    hourly traffic volume on I-94 westbound from Minneapolis-St Paul. It is
    based on data provided by the `MN Department of Transportation
    <https://www.dot.state.mn.us/>`_. Some additional data preparation was
    done by `John Hogue <https://github.com/dreyco676/Anomaly_Detection_A_to_Z/>`_.
    The dataset features some particularly interesting characteristics common
    for time series, which classical models may struggle to appropriately deal
    with. Such characteristics are:

    * High frequency (hourly)
    * Dependence on irregular events (holidays)
    * Strong and overlapping cycles (daily, weekly)
    * Anomalies
    * Multiple seasonalities

    Args:
        roles (bool):
            Return data with roles set

        units (bool):
            Return data with units set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

    Returns:
        getml.data.DataFrame:
            A DataFrame holding the data described above (a
            :class:`pandas.DataFrame` if `as_pandas` is True).

        The following DataFrames are returned:

            * traffic

    Examples:
        >>> traffic = getml.datasets.load_interstate94()
        >>> type(traffic)
        ... getml.data.data_frame.DataFrame

        For a full analysis of the interstate94 dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/interstate94>`_.

    Note:
        Roles and units can be set ad-hoc by supplying the respective flags. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. Before using them in an
        analysis, a data model needs to be constructed using
        :class:`~getml.data.Placeholder` s.
    """

    dataset = _load_dataset(
        ds_name="interstate94",
        roles=roles,
        units=units,
        as_pandas=as_pandas,
    )
    # `as_dict` is not passed, so a tuple comes back; the assert narrows the
    # Union return type for static type checkers.
    assert isinstance(dataset, tuple), "Expected a tuple"
    # The dataset has a single asset, hence a one-element tuple.
    return dataset[0]


def load_loans(
    roles: bool = True,
    units: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Binary classification dataset on loan default

    The loans dataset is based on the financial dataset from the `CTU Prague
    Relational Learning Repository
    <https://relational.fit.cvut.cz/dataset/Financial>`_.

    The original publication is:
    Berka, Petr (1999). Workshop notes on Discovery Challenge PKDD'99.

    The dataset contains information on 606 successful and 76 unsuccessful
    loans. After some preprocessing it contains 5 tables:

    * `account`: Information about the borrower(s) of a given loan.

    * `loan`: Information about the loans themselves, such as the date of creation, the amount, and the planned duration of the loan. The target variable is the status of the loan (default/no default)

    * `meta`: Meta information about the obligor, such as gender and geo-information

    * `order`: Information about permanent orders, debited payments and account balances.

    * `trans`: Information about transactions and accounts balances.

    The population table is split into a training and a testing set at 80% of the main population.

    Args:
        roles (bool):
            Return data with roles set

        units (bool):
            Return data with units set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing (in the order listed below) the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True) or
        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True). The keys correspond to the name of the DataFrame on the
            :mod:`~getml.engine`.

        The following DataFrames are returned:

            * population_train
            * population_test
            * order
            * trans
            * meta

    Examples:
        >>> loans = getml.datasets.load_loans(as_dict=True)
        >>> type(loans["population_train"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the loans dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/loans>`_.

    Note:
        Roles and units can be set ad-hoc by supplying the respective flags. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. Before using them in an
        analysis, a data model needs to be constructed using
        :class:`~getml.data.Placeholder` s.
    """

    return _load_dataset(
        ds_name="loans",
        roles=roles,
        units=units,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )


def load_occupancy(
    roles: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Binary classification dataset on occupancy detection

    The occupancy detection data set is a very simple multivariate time series
    from the `UCI Machine Learning Repository
    <https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+>`_. It is a
    binary classification problem. The task is to predict room occupancy
    from Temperature, Humidity, Light and CO2.

    The original publication is:
    Candanedo, L. M., & Feldheim, V. (2016). Accurate occupancy detection of an
    office room from light, temperature, humidity and CO2 measurements using
    statistical learning models. Energy and Buildings, 112, 28-39.

    Args:
        roles (bool):
            Return data with roles set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing (in the order listed below) the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True) or
        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if `as_pandas`
            is True). The keys correspond to the name of the DataFrame on the
            :mod:`~getml.engine`.

        The following DataFrames are returned:

            * population_train
            * population_test
            * population_validation

    Examples:
        >>> population_train, population_test, _ = getml.datasets.load_occupancy()
        >>> type(population_train)
        ... getml.data.data_frame.DataFrame

        For a full analysis of the occupancy dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/occupancy>`_.


    Note:
        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no units.
        Before using them in an analysis, a data model needs to be constructed
        using :class:`~getml.data.Placeholder` s.
    """

    return _load_dataset(
        ds_name="occupancy",
        roles=roles,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )