Source code for getml.datasets.base

# Copyright 2022 The SQLNet Company GmbH
#
# This file is licensed under the Elastic License 2.0 (ELv2).
# Refer to the LICENSE.txt file in the root of the repository
# for details.
#

"""
Load preprocessed datasets
"""

import json
import warnings
from typing import Dict, List, Optional, Tuple, Union
from urllib.request import urlopen  # type: ignore

import pandas as pd  # type: ignore

from getml.data import DataFrame

# Either kind of data frame a loader may hand back: the getml `DataFrame`
# imported above or a `pandas.DataFrame`.
DataFrameT = Union[DataFrame, pd.DataFrame]

# Version path segment of the download URL; `_load_dataset` places it directly
# after `BUCKET`.
VERSION: str = "v1"

# Root URL of the download location; `_load_dataset` appends
# "/<VERSION>/<ds_name>/preprocessed" to it.
BUCKET: str = "https://static.getml.com/datasets"
"""S3 bucket containing the CSV files"""

# Maps a dataset name to the assets that make it up. `_load_dataset` falls back
# to this table when no explicit `assets` list is given and, for every entry,
# reads "<ds_name>_<asset>.csv" into a data frame named after the asset.
# The order of each list is the order in which the frames are returned.
ASSETS: Dict[str, List[str]] = {
    "air_pollution": ["population"],
    "atherosclerosis": ["population", "contr"],
    "biodegradability": [
        "molecule_train",
        "molecule_test",
        "molecule_validation",
        "atom",
        "bond",
        "gmember",
        "group",
    ],
    "consumer_expenditures": [
        "population_training",
        "population_testing",
        "population_validation",
        "expd",
        "fmld",
        "memd",
    ],
    "interstate94": ["traffic"],
    "loans": [
        "population_train",
        "population_test",
        "order",
        "trans",
        "meta",
    ],
    # NOTE(review): no loader visible in this module references "loans_new";
    # confirm whether it is consumed elsewhere.
    "loans_new": [
        "account",
        "card",
        "client",
        "disp",
        "district",
        "loan",
        "order",
        "trans",
    ],
    "occupancy": ["population_train", "population_test", "population_validation"],
}


def _fetch_json(url: str) -> Dict:
    """Download `url` and return its body parsed as JSON.

    The response is used as a context manager so that it is closed even if
    reading or parsing raises.
    """
    with urlopen(url) as response:
        return json.loads(response.read())


def _load_dataset(
    ds_name: str,
    assets: Optional[List[str]] = None,
    roles: bool = False,
    units: bool = False,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """Helper function to load a dataset

    Every asset is read from
    ``<BUCKET>/<VERSION>/<ds_name>/preprocessed/<ds_name>_<asset>.csv`` into a
    :class:`~getml.DataFrame` named after the asset. Roles and units are
    optional and come from ``<ds_name>_roles.json`` and
    ``<ds_name>_units.json`` in the same location.

    Args:
        ds_name (str):
            name of the dataset

        assets (list):
            CSV files to be loaded from the S3 bucket. Defaults to
            ``ASSETS[ds_name]``.

        roles (bool):
            Return getml.DataFrame with roles set

        units (bool):
            Return getml.DataFrame with units set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing the data (in the order of `assets`) as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True) or

        dict:
            if `as_dict` is `True`: dict with the asset names as keys and the
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True) as values.

    Raises:
        KeyError:
            If `assets` is None and `ds_name` is not a key of `ASSETS`.
    """
    base = f"{BUCKET}/{VERSION}/{ds_name}/preprocessed"

    if assets is None:
        assets = ASSETS[ds_name]

    # Without roles every asset gets `roles=None` passed to `from_csv`.
    roles_ = _fetch_json(f"{base}/{ds_name}_roles.json") if roles else {}

    dfs: Dict[str, DataFrameT] = {}
    for asset in assets:
        filename = f"{base}/{ds_name}_{asset}.csv"
        # Warnings raised while loading are deliberately silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            print()
            print(f"Loading {asset}...")
            dfs[asset] = DataFrame.from_csv(
                name=asset,
                fnames=filename,
                roles=roles_.get(asset),
                verbose=False,
            )

    if units:
        units_ = _fetch_json(f"{base}/{ds_name}_units.json")

        # Units must be applied before a possible conversion to pandas.
        for key, df in dfs.items():
            for unit, columns in units_.get(key, {}).items():
                # Entries with an empty unit are skipped.
                if unit:
                    df.set_unit(columns, unit)

    if as_pandas:
        dfs = {key: df.to_pandas() for key, df in dfs.items()}

    if as_dict:
        return dfs

    # Dicts keep insertion order, so the tuple follows the order of `assets`.
    return tuple(dfs.values())


def load_air_pollution(
    roles: bool = True,
    as_pandas: bool = False,
) -> Union[DataFrameT]:
    """
    Regression dataset on air pollution in Beijing, China

    The dataset consists of a single table split into train and test sets
    around 2014-01-01.

    The original publication is:
    Liang, X., Zou, T., Guo, B., Li, S., Zhang, H., Zhang, S., Huang, H. and
    Chen, S. X. (2015). Assessing Beijing's PM2.5 pollution: severity, weather
    impact, APEC and winter heating. Proceedings of the Royal Society A, 471,
    20150257.

    Args:
        roles (bool):
            Return data with roles set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

    Returns:
        getml.data.DataFrame:
            A DataFrame holding the data described above (a
            :class:`pandas.DataFrame` if `as_pandas` is True).

            The following DataFrames are returned:

            * population

    Examples:
        >>> air_pollution = getml.datasets.load_air_pollution()
        >>> type(air_pollution)
        ... getml.data.data_frame.DataFrame

        For a full analysis of the air pollution dataset including all
        necessary preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples>`_.

    Note:
        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no
        units. Before using them in an analysis, a data model needs to be
        constructed using :class:`~getml.data.Placeholder` s.
    """
    frames = _load_dataset("air_pollution", roles=roles, as_pandas=as_pandas)

    # `as_dict` is left at its default, so a tuple is expected here.
    assert isinstance(frames, tuple), "Expected a tuple"
    return frames[0]


def load_atherosclerosis(
    roles: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Binary classification dataset on the lethality of atherosclerosis

    The atherosclerosis dataset is a medical dataset from the `CTU Prague
    Relational Learning Repository
    <https://relational.fit.cvut.cz/dataset/Atherosclerosis>`_. It contains
    information from a longitudinal study on 1417 middle-aged men observed
    over the course of 20 years. After preprocessing, it consists of 2 tables
    with 76 and 66 columns:

    * `population`: Data on the study's participants

    * `contr`: Data on control dates

    The population table is split into a training (70%), a testing (15%) set
    and a validation (15%) set.

    Args:
        roles (bool):
            Return data with roles set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing the data (in the order listed below) as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True) or

        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True). The keys correspond to the name of the
            DataFrame on the :mod:`~getml.engine`.

            The following DataFrames are returned:

            * population
            * contr

    Examples:
        >>> population, contr = getml.datasets.load_atherosclerosis()
        >>> type(population)
        ... getml.data.data_frame.DataFrame

        For a full analysis of the atherosclerosis dataset including all
        necessary preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/atherosclerosis>`_.

    Note:
        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no
        units. Before using them in an analysis, a data model needs to be
        constructed using :class:`~getml.data.Placeholder` s.
    """
    return _load_dataset(
        "atherosclerosis",
        roles=roles,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )


def load_biodegradability(
    roles: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Regression dataset on molecule weight prediction

    The QSAR biodegradation dataset was built in the Milano Chemometrics and
    QSAR Research Group (Universita degli Studi Milano-Bicocca, Milano,
    Italy). The data have been used to develop QSAR (Quantitative Structure
    Activity Relationships) models for the study of the relationships between
    chemical structure and biodegradation of molecules. Biodegradation
    experimental values of 1055 chemicals were collected from the webpage of
    the National Institute of Technology and Evaluation of Japan (NITE).

    The original publication is:
    Mansouri, K., Ringsted, T., Ballabio, D., Todeschini, R., Consonni, V.
    (2013). Quantitative Structure - Activity Relationship models for ready
    biodegradability of chemicals. Journal of Chemical Information and
    Modeling, 53, 867-878

    The dataset was collected through the `UCI Machine Learning Repository
    <https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation>`_.

    It contains information on 1309 molecules with 6166 bonds. It consists of
    5 tables.

    The population table is split into a training (50%), a testing (25%) and
    a validation (25%) set.

    Args:
        roles (bool):
            Return data with roles set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing the data (in the order listed below) as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True) or

        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True). The keys correspond to the name of the
            DataFrame on the :mod:`~getml.engine`.

            The following DataFrames are returned:

            * molecule_train
            * molecule_test
            * molecule_validation
            * atom
            * bond
            * gmember
            * group

    Examples:
        >>> biodegradability = getml.datasets.load_biodegradability(as_dict=True)
        >>> type(biodegradability["molecule_train"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the biodegradability dataset including all
        necessary preprocessing steps please refer to getml-examples
        (forthcoming).

    Note:
        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no
        units. Before using them in an analysis, a data model needs to be
        constructed using :class:`~getml.data.Placeholder` s.
    """
    return _load_dataset(
        "biodegradability",
        roles=roles,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )


def load_consumer_expenditures(
    roles: bool = True,
    units: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Binary classification dataset on consumer expenditures

    The Consumer Expenditure Data Set is a public domain data set provided by
    the American Bureau of Labor Statistics (https://www.bls.gov/cex/pumd.htm).
    It includes the diary entries, where American consumers are asked to keep
    diaries of the products they have purchased each month.

    We use this dataset to classify whether an item was purchased as a gift.

    Args:
        roles (bool):
            Return data with roles set

        units (bool):
            Return data with units set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing the data (in the order listed below) as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True) or

        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True). The keys correspond to the name of the
            DataFrame on the :mod:`~getml.engine`.

            The following DataFrames are returned:

            * population_training
            * population_testing
            * population_validation
            * expd
            * fmld
            * memd

    Examples:
        >>> ce = getml.datasets.load_consumer_expenditures(as_dict=True)
        >>> type(ce["expd"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the consumer expenditures dataset including
        all necessary preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/consumer_expenditures>`_.

    Note:
        Roles and units can be set ad-hoc by supplying the respective flags.
        If `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. Before using them in an
        analysis, a data model needs to be constructed using
        :class:`~getml.data.Placeholder` s.
    """
    return _load_dataset(
        "consumer_expenditures",
        roles=roles,
        units=units,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )


def load_interstate94(
    roles: bool = True,
    units: bool = True,
    as_pandas: bool = False,
) -> Union[DataFrameT]:
    """
    Regression dataset on traffic volume prediction

    The interstate94 dataset is a multivariate time series containing the
    hourly traffic volume on I-94 westbound from Minneapolis-St Paul. It is
    based on data provided by the `MN Department of Transportation
    <https://www.dot.state.mn.us/>`_. Some additional data preparation was
    done by `John Hogue
    <https://github.com/dreyco676/Anomaly_Detection_A_to_Z/>`_. The dataset
    features some particularly interesting characteristics common for time
    series, which classical models may struggle to appropriately deal with.
    Such characteristics are:

    * High frequency (hourly)
    * Dependence on irregular events (holidays)
    * Strong and overlapping cycles (daily, weekly)
    * Anomalies
    * Multiple seasonalities

    Args:
        roles (bool):
            Return data with roles set

        units (bool):
            Return data with units set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

    Returns:
        getml.data.DataFrame:
            A DataFrame holding the data described above (a
            :class:`pandas.DataFrame` if `as_pandas` is True).

            The following DataFrames are returned:

            * traffic

    Examples:
        >>> traffic = getml.datasets.load_interstate94()
        >>> type(traffic)
        ... getml.data.data_frame.DataFrame

        For a full analysis of the interstate94 dataset including all
        necessary preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/interstate94>`_.

    Note:
        Roles and units can be set ad-hoc by supplying the respective flags.
        If `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. Before using them in an
        analysis, a data model needs to be constructed using
        :class:`~getml.data.Placeholder` s.
    """
    frames = _load_dataset(
        "interstate94",
        roles=roles,
        units=units,
        as_pandas=as_pandas,
    )

    # `as_dict` is left at its default, so a tuple is expected here.
    assert isinstance(frames, tuple), "Expected a tuple"
    return frames[0]


def load_loans(
    roles: bool = True,
    units: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Binary classification dataset on loan default

    The loans dataset is based on a financial dataset from the `CTU Prague
    Relational Learning Repository
    <https://relational.fit.cvut.cz/dataset/Financial>`_.

    The original publication is:
    Berka, Petr (1999). Workshop notes on Discovery Challenge PKDD'99.

    The dataset contains information on 606 successful and 76 unsuccessful
    loans. After some preprocessing it contains 5 tables

    * `account`: Information about the borrower(s) of a given loan.

    * `loan`: Information about the loans themselves, such as the date of
      creation, the amount, and the planned duration of the loan. The target
      variable is the status of the loan (default/no default)

    * `meta`: Meta information about the obligor, such as gender and
      geo-information

    * `order`: Information about permanent orders, debited payments and
      account balances.

    * `trans`: Information about transactions and accounts balances.

    The population table is split into a training and a testing set at 80% of
    the main population.

    Args:
        roles (bool):
            Return data with roles set

        units (bool):
            Return data with units set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing the data (in the order listed below) as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True) or

        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True). The keys correspond to the name of the
            DataFrame on the :mod:`~getml.engine`.

            The following DataFrames are returned:

            * population_train
            * population_test
            * order
            * trans
            * meta

    Examples:
        >>> loans = getml.datasets.load_loans(as_dict=True)
        >>> type(loans["population_train"])
        ... getml.data.data_frame.DataFrame

        For a full analysis of the loans dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/loans>`_.

    Note:
        Roles and units can be set ad-hoc by supplying the respective flags.
        If `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. Before using them in an
        analysis, a data model needs to be constructed using
        :class:`~getml.data.Placeholder` s.
    """
    return _load_dataset(
        "loans",
        roles=roles,
        units=units,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )


def load_occupancy(
    roles: bool = True,
    as_pandas: bool = False,
    as_dict: bool = False,
) -> Union[Tuple[DataFrameT, ...], Dict[str, DataFrameT]]:
    """
    Binary classification dataset on occupancy detection

    The occupancy detection data set is a very simple multivariate time series
    from the `UCI Machine Learning Repository
    <https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+>`_. It is a
    binary classification problem. The task is to predict room occupancy from
    Temperature, Humidity, Light and CO2.

    The original publication is:
    Candanedo, L. M., & Feldheim, V. (2016). Accurate occupancy detection of
    an office room from light, temperature, humidity and CO2 measurements
    using statistical learning models. Energy and Buildings, 112, 28-39.

    Args:
        roles (bool):
            Return data with roles set

        as_pandas (bool):
            Return data as `pandas.DataFrame` s

        as_dict (bool):
            Return data as dict with `df.name` s as keys and
            `df` s as values.

    Returns:
        tuple:
            Tuple containing the data (in the order listed below) as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True) or

        dict:
            if `as_dict` is `True`: Dictionary containing the data as
            :class:`~getml.DataFrame` s or :class:`pandas.DataFrame` s (if
            `as_pandas` is True). The keys correspond to the name of the
            DataFrame on the :mod:`~getml.engine`.

            The following DataFrames are returned:

            * population_train
            * population_test
            * population_validation

    Examples:
        >>> population_train, population_test, _ = getml.datasets.load_occupancy()
        >>> type(population_train)
        ... getml.data.data_frame.DataFrame

        For a full analysis of the occupancy dataset including all necessary
        preprocessing steps please refer to `getml-examples
        <https://github.com/getml/getml-examples/tree/master/occupancy>`_.

    Note:
        Roles can be set ad-hoc by supplying the respective flag. If
        `roles` is `False`, all columns in the returned
        :class:`~getml.DataFrame` s have roles
        :const:`~getml.data.roles.unused_string` or
        :const:`~getml.data.roles.unused_float`. This dataset contains no
        units. Before using them in an analysis, a data model needs to be
        constructed using :class:`~getml.data.Placeholder` s.
    """
    return _load_dataset(
        "occupancy",
        roles=roles,
        as_pandas=as_pandas,
        as_dict=as_dict,
    )