# Source code for getml.pipeline.pipelines

# Copyright 2021 The SQLNet Company GmbH

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

"""
Container which holds all of a project's pipelines.
"""

from getml.utilities.formatting import _Formatter

from .helpers2 import _refresh_all, list_pipelines
from .scores import accuracy, auc, cross_entropy, mae, rmse, rsquared

# --------------------------------------------------------------------


class Pipelines:
    """
    Container which holds all pipelines associated with the currently running
    project. The container supports slicing and is sort- and filterable.

    Attributes:
        ids:
            The value returned by ``list_pipelines()`` when the container was
            constructed. It is fetched on every construction, including for
            the containers produced by slicing, sorting and filtering.
        data:
            The pipelines held by this container.

    Example:

        Show the first 10 pipelines belonging to the current project:

        .. code-block:: python

            getml.project.pipelines[:10]

        You can use nested list comprehensions to retrieve a scoring history
        for your project:

        .. code-block:: python

            import matplotlib.pyplot as plt

            hyperopt_scores = [(score.date_time, score.mae) for pipe in getml.project.pipelines
                                  for score in pipe.scores["data_test"]
                                  if "hyperopt" in pipe.tags]

            fig, ax = plt.subplots()
            ax.bar(*zip(*hyperopt_scores))
    """

    # ----------------------------------------------------------------

    def __init__(self, data=None):
        """
        Args:
            data (list, optional):
                The pipelines to hold. If None, the pipelines are obtained
                from ``_refresh_all()``.
        """
        self.ids = list_pipelines()

        if data is None:
            self.data = _refresh_all()
        else:
            self.data = data

    # ----------------------------------------------------------------

    def __getitem__(self, key):
        """
        Retrieves pipelines by position, slice or id.

        Args:
            key (int, slice or str):
                An int returns the pipeline at that position, a slice returns
                a new container holding the selected pipelines and a str
                returns the pipeline whose ``id`` equals the key.

        Raises:
            AttributeError: If no pipeline in this container has the id.
            TypeError: If the key is neither an int, a slice nor a str.
        """
        if isinstance(key, int):
            return self.data[key]

        if isinstance(key, slice):
            return Pipelines(data=self.data[key])

        if isinstance(key, str):
            # Search the pipelines this container actually holds rather than
            # trusting self.ids: self.ids is refreshed from list_pipelines()
            # on every construction, so for a subset it can name pipelines
            # that are not part of self.data.
            for pipeline in self.data:
                if pipeline.id == key:
                    return pipeline
            raise AttributeError(f"No Pipeline with id: {key}")

        raise TypeError(
            f"Pipelines can only be indexed by: int, slices, or str, not {type(key).__name__}"
        )

    # ----------------------------------------------------------------

    def __len__(self):
        """Returns the number of pipelines held by this container."""
        return len(self.data)

    # ----------------------------------------------------------------

    def __repr__(self):
        """Returns the plain text table, or a notice if the container is empty."""
        # Emptiness is decided on self.data, which is what gets rendered.
        if not self.data:
            return "No pipelines in memory."

        return self._format()._render_string()

    # ----------------------------------------------------------------

    def _repr_html_(self):
        """Returns the HTML table, or a notice if the container is empty."""
        if not self.data:
            return "<p>No pipelines in memory.</p>"

        return self._format()._render_html()

    # ----------------------------------------------------------------

    # NOTE: The misspelling ("regresion") is kept on purpose, so that any
    # existing reference to this property keeps working.
    @property
    def _contains_regresion_pipelines(self):
        """Whether any pipeline in the container has a truthy ``is_regression``."""
        return any(pipe.is_regression for pipe in self.data)

    # ----------------------------------------------------------------

    @property
    def _contains_classification_pipelines(self):
        """Whether any pipeline in the container has a truthy ``is_classification``."""
        return any(pipe.is_classification for pipe in self.data)

    # ----------------------------------------------------------------

    def _format(self):
        """
        Builds the formatter used to render the container as a table.

        The table has one row per pipeline with the columns id, tags,
        feature learners, targets, the score columns and the set used.
        Classification score columns are included if the container holds at
        least one classification pipeline, regression score columns if it
        holds at least one regression pipeline.

        Returns:
            _Formatter: The formatter holding headers and rows.
        """
        scores_headers = []

        if self._contains_classification_pipelines:
            scores_headers.extend([accuracy, auc, cross_entropy])

        if self._contains_regresion_pipelines:
            scores_headers.extend([mae, rmse, rsquared])

        # ------------------------------------------------------------

        headers = [
            [
                "id",
                "tags",
                "feature learners",
                "targets",
                *scores_headers,
                "set used",
            ]
        ]

        # Every row looks up *all* active score columns, so that rows and
        # headers always have the same width - also when classification and
        # regression pipelines are mixed. Scores a pipeline does not have
        # fall back to an empty list.
        rows = [
            [
                pipeline.id,
                pipeline.tags,
                [feature_learner.type for feature_learner in pipeline.feature_learners],
                pipeline.targets,
                *(pipeline._scores.get(score, []) for score in scores_headers),
                pipeline._scores.get("set_used", ""),
            ]
            for pipeline in self.data
        ]

        # ------------------------------------------------------------

        return _Formatter(headers, rows)

    # ----------------------------------------------------------------

    def sort(self, key, descending=False):
        """
        Sorts the pipelines container.

        Args:
            key (callable):
                A callable that evaluates to a sort key for a given item.
            descending (bool, optional):
                Whether to sort in descending order.

        Returns:
            :class:`getml.pipeline.Pipelines`:
                A container of sorted pipelines.

        Example:
            .. code-block:: python

                by_auc = getml.project.pipelines.sort(key=lambda pipe: pipe.auc)

                by_fl = getml.project.pipelines.sort(key=lambda pipe: pipe.feature_learners[0].type)

        """
        pipelines_sorted = sorted(self.data, key=key, reverse=descending)
        return Pipelines(data=pipelines_sorted)

    # ----------------------------------------------------------------

    def filter(self, conditional):
        """
        Filters the pipelines container.

        Args:
            conditional (callable):
                A callable that evaluates to a boolean for a given item.

        Returns:
            :class:`getml.pipeline.Pipelines`:
                A container of filtered pipelines.

        Example:
            .. code-block:: python

                pipes_with_tags = getml.project.pipelines.filter(lambda pipe: len(pipe.tags) > 0)

                accurate_pipes = getml.project.pipelines.filter(lambda pipe: all(acc > 0.9 for acc in pipe.accuracy))

        """
        pipelines_filtered = [
            pipeline for pipeline in self.data if conditional(pipeline)
        ]

        return Pipelines(data=pipelines_filtered)