# Copyright 2022 The SQLNet Company GmbH
#
# This file is licensed under the Elastic License 2.0 (ELv2).
# Refer to the LICENSE.txt file in the root of the repository
# for details.
#
"""
Contains hyperparameter optimization routines.
"""
import copy
import json
import time
from typing import Any, Dict, List, Union
import getml.communication as comm
from getml.data import Container, StarSchema, TimeSeries
from getml.data.helpers import _remove_trailing_underscores
from getml.pipeline import delete, exists, load, metrics, Pipeline
from getml.pipeline.helpers import _make_id, _print_time_taken, _transform_peripheral
from getml.utilities.formatting import _SignatureFormatter
from .burn_in import latin_hypercube, random
from .kernels import matern52
from .optimization import nelder_mead
from .validation import _validate_hyperopt
# -----------------------------------------------------------------------------
def _get_json_obj(name):
"""
Retrieves a JSON representation of the hyperopt object *name*
from the engine.
"""
cmd: Dict[str, Any] = {}
cmd["name_"] = name
cmd["type_"] = "Hyperopt.refresh"
with comm.send_and_get_socket(cmd) as sock:
msg = comm.recv_string(sock)
if msg[0] != "{":
comm.engine_exception_handler(msg)
return json.loads(msg)
# -----------------------------------------------------------------------------
class _Hyperopt:
"""
Base class that is not meant to be called directly by the user.
"""
def __init__(
self,
param_space: Dict[str, Any],
pipeline: Pipeline,
score: str,
n_iter: int,
seed: int,
ratio_iter=1.0,
optimization_algorithm=nelder_mead,
optimization_burn_in_algorithm=latin_hypercube,
optimization_burn_ins=15,
surrogate_burn_in_algorithm=latin_hypercube,
gaussian_kernel=matern52,
gaussian_optimization_burn_in_algorithm=latin_hypercube,
gaussian_optimization_algorithm=nelder_mead,
gaussian_optimization_burn_ins=50,
gaussian_nugget=50,
early_stopping=True,
):
self._id = "NOT SENT TO ENGINE"
self._type = "_Hyperopt"
self._score = score
self._original_param_space = param_space
self.evaluations: List[Any] = []
self.pipeline = copy.deepcopy(pipeline)
self.param_space = param_space
self.n_iter = n_iter
self.seed = seed
self.ratio_iter = ratio_iter
self.optimization_algorithm = optimization_algorithm
self.optimization_burn_in_algorithm = optimization_burn_in_algorithm
self.optimization_burn_ins = optimization_burn_ins
self.surrogate_burn_in_algorithm = surrogate_burn_in_algorithm
self.gaussian_kernel = gaussian_kernel
self.gaussian_optimization_algorithm = gaussian_optimization_algorithm
self.gaussian_optimization_burn_in_algorithm = (
gaussian_optimization_burn_in_algorithm
)
self.gaussian_optimization_burn_ins = gaussian_optimization_burn_ins
self.gaussian_nugget = gaussian_nugget
self.early_stopping = early_stopping
_Hyperopt._supported_params = list(self.__dict__.keys()) # type: ignore
# ----------------------------------------------------------------
def __repr__(self):
return str(self)
# ------------------------------------------------------------
def _append_underscore(self, some_dict):
"""Helper function that returns a copy of *some_dict* with a trailing underscore appended to all keys."""
cmd: Dict[str, Any] = {}
for kkey in some_dict:
if kkey == "evaluations":
cmd[kkey + "_"] = some_dict[kkey]
elif isinstance(some_dict[kkey], dict):
cmd[kkey + "_"] = self._append_underscore(some_dict[kkey])
elif isinstance(some_dict[kkey], list):
cmd[kkey + "_"] = [
self._append_underscore(elem) if isinstance(elem, dict) else elem
for elem in some_dict[kkey]
]
else:
cmd[kkey + "_"] = some_dict[kkey]
return cmd
# ------------------------------------------------------------
def _best_pipeline_name(self):
if not self.evaluations:
raise ValueError("The hyperparameter optimization has not been fitted!")
def key(x):
return x["evaluation"]["score"]
# The hyperparameter optimization always minimizes.
# Scores like AUC or RSquared are multiplied by -1.
return min(self.evaluations, key=key)["pipeline_name"]
# ------------------------------------------------------------
def _getml_deserialize(self) -> Dict[str, Any]:
"""
Expresses the hyperparameter optimization in a form the engine can understand.
"""
cmd = self._append_underscore(self.__dict__)
del cmd["_id_"]
del cmd["_score_"]
del cmd["_type_"]
del cmd["_original_param_space_"]
cmd["name_"] = self.id
cmd["score_"] = self.score
cmd["type_"] = self.type
cmd["pipeline_"] = self.pipeline._getml_deserialize()
return cmd
# ----------------------------------------------------------------
def _parse_json_obj(self, json_obj: Dict[str, Any]) -> "_Hyperopt":
pipeline = self.pipeline._parse_cmd(json_obj["pipeline_"])
del json_obj["pipeline_"]
kwargs = _remove_trailing_underscores(json_obj)
evaluations: List[Any] = []
if "evaluations" in kwargs:
evaluations = kwargs["evaluations"]
del kwargs["evaluations"]
param_space = kwargs["param_space"]
del kwargs["param_space"]
del kwargs["name"]
del kwargs["type"]
id_ = self.id
self.__init__(param_space=param_space, pipeline=pipeline, **kwargs) # type: ignore
self._id = id_
self.evaluations = evaluations
return self
# ----------------------------------------------------------------
def _save(self):
cmd: Dict[str, Any] = {}
cmd["type_"] = "Hyperopt.save"
cmd["name_"] = self.id
comm.send(cmd)
# ------------------------------------------------------------
def _send(self):
self._id = _make_id()
self.pipeline._id = self._id
cmd = self._getml_deserialize()
comm.send(cmd)
return self
# ------------------------------------------------------------
@property
def best_pipeline(self):
"""
The best pipeline that is part of the hyperparameter optimization.
This is always based on the validation
data you have passed, even if you have chosen to
score the pipeline on other data afterwards.
"""
name = self._best_pipeline_name()
return load(name)
# ------------------------------------------------------------
def clean_up(self):
"""
Deletes all pipelines associated with the hyperparameter optimization,
except for the best pipeline.
"""
best_pipeline = self._best_pipeline_name()
names = [obj["pipeline_name"] for obj in self.evaluations]
for name in names:
if name == best_pipeline:
continue
if exists(name):
delete(name)
# ------------------------------------------------------------
def fit(
self,
container: Union[Container, StarSchema, TimeSeries],
train: str = "train",
validation: str = "validation",
):
"""Launches the hyperparameter optimization.
Args:
container (:class:`~getml.data.Container`, :class:`~getml.data.StarSchema` or :class:`~getml.data.TimeSeries`):
The data container used for the hyperparameter tuning.
train (str, optional):
The name of the subset in 'container' used for training.
validation (str, optional):
The name of the subset in 'container' used for validation.
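Example:
A minimal sketch, assuming `population` and `peripheral` are
:class:`~getml.DataFrame` instances and `search` is one of the
hyperparameter optimizers in this module. The keyword passed
to ``add`` must match the name of the corresponding peripheral
placeholder in the pipeline:
.. code-block:: python
container = getml.data.Container(
    train=population, validation=population
)
container.add(peripheral=peripheral)
search.fit(container, train="train", validation="validation")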
"""
if isinstance(container, (StarSchema, TimeSeries)):
container = container.container
if not isinstance(container, Container):
raise TypeError(
"'container' must be a `~getml.data.Container`, "
+ "a `~getml.data.StarSchema` or a `~getml.data.TimeSeries`"
)
if not isinstance(train, str):
raise TypeError("'train' must be a string")
if not isinstance(validation, str):
raise TypeError("'validation' must be a string")
self.pipeline.check(container[train])
population_table_training = container[train].population
population_table_validation = container[validation].population
peripheral_tables = _transform_peripheral(
container[train].peripheral, self.pipeline.peripheral
)
self._send()
cmd: Dict[str, Any] = {}
cmd["name_"] = self.id
cmd["type_"] = "Hyperopt.launch"
cmd["population_training_df_"] = population_table_training._getml_deserialize()
cmd[
"population_validation_df_"
] = population_table_validation._getml_deserialize()
cmd["peripheral_dfs_"] = [
elem._getml_deserialize() for elem in peripheral_tables
]
with comm.send_and_get_socket(cmd) as sock:
begin = time.time()
msg = comm.log(sock)
end = time.time()
if msg != "Success!":
comm.engine_exception_handler(msg)
print()
_print_time_taken(begin, end, "Time taken: ")
self._save()
return self.refresh()
# ------------------------------------------------------------
@property
def id(self) -> str:
"""
Name of the hyperparameter optimization.
This is used to uniquely identify it on the engine.
"""
return self._id
# ------------------------------------------------------------
@property
def name(self) -> str:
"""
Returns the ID of the hyperparameter optimization.
The name property is kept for backward compatibility.
"""
return self._id
# ------------------------------------------------------------
def refresh(self) -> "_Hyperopt":
"""Reloads the hyperparameter optimization from the engine.
Returns:
_Hyperopt:
Current instance, reloaded from the engine.
"""
json_obj = _get_json_obj(self.id)
return self._parse_json_obj(json_obj)
# ------------------------------------------------------------
@property
def score(self):
"""
The score to be optimized.
"""
return self._score
# ------------------------------------------------------------
@property
def type(self):
"""
The algorithm used for the hyperparameter optimization.
"""
return self._type
# -----------------------------------------------------------------------------
class GaussianHyperparameterSearch(_Hyperopt):
"""Bayesian hyperparameter optimization using a Gaussian process.
After a burn-in period,
a Gaussian process is used to pick the most promising
parameter combination to be evaluated next based on the knowledge gathered
throughout previous evaluations. Assessing the quality of potential
combinations is done using the expected information (EI).
Args:
param_space (dict):
Dictionary containing numerical arrays of length two
holding the lower and upper bounds of all parameters which
will be altered in `pipeline` during the hyperparameter
optimization.
If we have two feature learners and one predictor,
the hyperparameter space might look like this:
.. code-block:: python
param_space = {
"feature_learners": [
{
"num_features": [10, 50],
},
{
"max_depth": [1, 10],
"min_num_samples": [100, 500],
"num_features": [10, 50],
"reg_lambda": [0.0, 0.1],
"shrinkage": [0.01, 0.4]
}],
"predictors": [
{
"reg_lambda": [0.0, 10.0]
}
]
}
If we only want to optimize the predictor, then
we can leave out the feature learners.
pipeline (:class:`~getml.Pipeline`):
Base pipeline used to derive all models fitted and scored
during the hyperparameter optimization. Be careful when
constructing it since only the parameters present in
`param_space` will be overwritten. It defines the data
schema and any hyperparameters that are not optimized.
score (str, optional):
The score to optimize. Must be from
:mod:`~getml.pipeline.metrics`.
n_iter (int, optional):
Number of iterations in the hyperparameter optimization
and thus the number of parameter combinations to draw and
evaluate. Range: [1, :math:`\\infty`]
seed (int, optional):
Seed used for the random number generator that underlies
the sampling procedure, making the calculation
reproducible. Due to the nature of the underlying algorithm,
reproducibility is only guaranteed if the fit is done
without multithreading, i.e. if the ``num_threads`` and
``n_jobs`` instance variables of the ``predictor`` and
``feature_selector`` in `pipeline` - if they are instances of
either :class:`~getml.predictors.XGBoostRegressor` or
:class:`~getml.predictors.XGBoostClassifier` - are set to
1. Internally, a `seed` of None will be mapped to
5543. Range: [0, :math:`\\infty`]
ratio_iter (float, optional):
Ratio of the iterations used for the burn-in.
For a `ratio_iter` of 1.0, all iterations will be
spent in the burn-in period resulting in an equivalence of
this class to
:class:`~getml.hyperopt.LatinHypercubeSearch` or
:class:`~getml.hyperopt.RandomSearch` - depending on
`surrogate_burn_in_algorithm`. Range: [0, 1]
As a *rule of thumb*, at least 70 percent of the evaluations
should be spent in the burn-in phase. The more comprehensive
the exploration of the `param_space` during the burn-in,
the less likely it is that the Gaussian process gets stuck in
local minima.
optimization_algorithm (string, optional):
Determines the optimization algorithm used for the local
search in the optimization of the expected information
(EI). Must be from
:mod:`~getml.hyperopt.optimization`.
optimization_burn_in_algorithm (string, optional):
Specifies the algorithm used to draw initial points in the
burn-in period of the optimization of the expected
information (EI). Must be from :mod:`~getml.hyperopt.burn_in`.
optimization_burn_ins (int, optional):
Number of random evaluation points used during the burn-in
of the minimization of the expected information (EI).
After the surrogate model - the Gaussian process - has been
successfully fitted to the previous parameter combinations,
the algorithm is able to calculate the EI for a given point. In
order to get to the next combination, the EI has to be
maximized over the whole parameter space. Much like the
GaussianProcess itself, this requires a burn-in phase.
Range: [3, :math:`\\infty`]
surrogate_burn_in_algorithm (string, optional):
Specifies the algorithm used to draw new parameter
combinations during the burn-in period.
Must be from :mod:`~getml.hyperopt.burn_in`.
gaussian_kernel (string, optional):
Specifies the 1-dimensional kernel of the Gaussian process
which will be used along each dimension of the parameter
space. All of the available kernels result in continuous
sample paths; they mainly differ in the degree of
smoothness of the results, with 'exp' yielding the least
smooth and 'gauss' the smoothest paths.
Must be from :mod:`~getml.hyperopt.kernels`.
gaussian_optimization_algorithm (string, optional):
Determines the optimization algorithm used for the local
search in the fitting of the Gaussian process to the
previous parameter combinations. Must be from
:mod:`~getml.hyperopt.optimization`.
gaussian_optimization_burn_in_algorithm (string, optional):
Specifies the algorithm used to draw new parameter
combinations during the burn-in period of the optimization
of the Gaussian process.
Must be from :mod:`~getml.hyperopt.burn_in`.
gaussian_optimization_burn_ins (int, optional):
Number of random evaluation points used during the burn-in
of the fitting of the Gaussian process. Range: [3,
:math:`\\infty`]
early_stopping (bool, optional):
Whether you want to apply early stopping to the predictors.
Note:
A Gaussian hyperparameter search works like this:
- It begins with a burn-in phase, usually about 70% to 90%
of all iterations. During that burn-in phase, the hyperparameter
space is sampled more or less at random. You can control
this phase using ``ratio_iter`` and ``surrogate_burn_in_algorithm``.
- Once enough information has been collected, it fits a
Gaussian process on the hyperparameters with the ``score`` we want to
maximize or minimize as the predicted variable. Note that the
Gaussian process has hyperparameters itself, which are also optimized.
You can control this phase using ``gaussian_kernel``,
``gaussian_optimization_algorithm``,
``gaussian_optimization_burn_in_algorithm`` and
``gaussian_optimization_burn_ins``.
- It then uses the Gaussian process to predict the expected information
(EI), which is how much additional information it might get from
evaluating
a particular point in the hyperparameter space. The expected information
is to be maximized. The point in the hyperparameter space with
the maximum expected information is the next point that is actually
evaluated (meaning a new pipeline with these hyperparameters is trained).
You can control this phase using ``optimization_algorithm``,
``optimization_burn_ins`` and ``optimization_burn_in_algorithm``.
In a nutshell, the GaussianHyperparameterSearch behaves like a human data scientist:
- At first, it picks random hyperparameter combinations.
- Once it has gained a better understanding of the hyperparameter space,
it starts evaluating hyperparameter combinations that are
particularly interesting.
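The acquisition step described above can be sketched in a few
lines. The following illustration uses *expected improvement*,
a closely related acquisition criterion, in place of the
engine's expected-information criterion; it is a schematic
sketch, not the engine's actual implementation:
.. code-block:: python
from scipy.stats import norm
def expected_improvement(mu, sigma, best_score):
    # mu, sigma: posterior mean and standard deviation of
    # the fitted Gaussian process at a candidate point.
    # best_score: lowest score observed so far (the
    # hyperparameter optimization always minimizes).
    z = (best_score - mu) / sigma
    return (best_score - mu) * norm.cdf(z) + sigma * norm.pdf(z)
# The candidate point with the largest acquisition value is
# the one that is evaluated next.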
References:
- `Carl Edward Rasmussen and Christopher K. I. Williams, MIT
Press, 2006 <http://www.gaussianprocess.org/gpml/>`_
- `Julien Villemonteix, Emmanuel Vazquez, and Eric Walter, 2009
<https://arxiv.org/pdf/cs/0611143.pdf>`_
Example:
.. code-block:: python
from getml import data
from getml import datasets
from getml import engine
from getml import feature_learning
from getml.feature_learning import aggregations
from getml.feature_learning import loss_functions
from getml import hyperopt
from getml import pipeline
from getml import predictors
# ----------------
engine.set_project("examples")
# ----------------
population_table, peripheral_table = datasets.make_numerical()
# ----------------
# Construct placeholders
population_placeholder = data.Placeholder("POPULATION")
peripheral_placeholder = data.Placeholder("PERIPHERAL")
population_placeholder.join(peripheral_placeholder, "join_key", "time_stamp")
# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.
fe1 = feature_learning.Multirel(
aggregation=[
aggregations.Count,
aggregations.Sum
],
loss_function=loss_functions.SquareLoss,
num_features=10,
share_aggregations=1.0,
max_length=1,
num_threads=0
)
# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.
fe2 = feature_learning.Relboost(
loss_function=loss_functions.SquareLoss,
num_features=10
)
# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.
predictor = predictors.LinearRegression()
# ----------------
pipe = pipeline.Pipeline(
population=population_placeholder,
peripheral=[peripheral_placeholder],
feature_learners=[fe1, fe2],
predictors=[predictor]
)
# ----------------
# Build a hyperparameter space.
# We have two feature learners and one
# predictor, so this is how we must
# construct our hyperparameter space.
# If we only wanted to optimize the predictor,
# we could just leave out the feature_learners.
param_space = {
"feature_learners": [
{
"num_features": [10, 50],
},
{
"max_depth": [1, 10],
"min_num_samples": [100, 500],
"num_features": [10, 50],
"reg_lambda": [0.0, 0.1],
"shrinkage": [0.01, 0.4]
}],
"predictors": [
{
"reg_lambda": [0.0, 10.0]
}
]
}
# ----------------
# Wrap a GaussianHyperparameterSearch around the reference model
gaussian_search = hyperopt.GaussianHyperparameterSearch(
pipeline=pipe,
param_space=param_space,
n_iter=30,
score=pipeline.metrics.rsquared
)
# fit(...) expects a data container holding the
# training and validation subsets.
container = data.Container(
    train=population_table,
    validation=population_table
)
container.add(PERIPHERAL=peripheral_table)
gaussian_search.fit(
    container=container,
    train="train",
    validation="validation"
)
# ----------------
# We want 5 additional iterations.
gaussian_search.n_iter = 5
# We do not want another burn-in-phase,
# so we set ratio_iter to 0.
gaussian_search.ratio_iter = 0.0
# This widens the hyperparameter space.
gaussian_search.param_space["feature_learners"][1]["num_features"] = [10, 100]
# This narrows the hyperparameter space.
gaussian_search.param_space["predictors"][0]["reg_lambda"] = [0.0, 0.0]
# This continues the hyperparameter search using the previous iterations as
# prior knowledge.
gaussian_search.fit(
    container=container,
    train="train",
    validation="validation"
)
# ----------------
all_hyp = hyperopt.list_hyperopts()
best_pipeline = gaussian_search.best_pipeline
Note:
Not supported in the getML community edition.
"""
def __init__(
self,
param_space: Dict[str, Any],
pipeline: Pipeline,
score=metrics.rmse,
n_iter=100,
seed=5483,
ratio_iter=0.80,
optimization_algorithm=nelder_mead,
optimization_burn_in_algorithm=latin_hypercube,
optimization_burn_ins=500,
surrogate_burn_in_algorithm=latin_hypercube,
gaussian_kernel=matern52,
gaussian_optimization_burn_in_algorithm=latin_hypercube,
gaussian_optimization_algorithm=nelder_mead,
gaussian_optimization_burn_ins=500,
gaussian_nugget=50,
early_stopping=True,
):
super().__init__(
param_space=param_space,
pipeline=pipeline,
score=score,
n_iter=n_iter,
seed=seed,
ratio_iter=ratio_iter,
optimization_algorithm=optimization_algorithm,
optimization_burn_in_algorithm=optimization_burn_in_algorithm,
optimization_burn_ins=optimization_burn_ins,
surrogate_burn_in_algorithm=surrogate_burn_in_algorithm,
gaussian_kernel=gaussian_kernel,
gaussian_optimization_algorithm=gaussian_optimization_algorithm,
gaussian_optimization_burn_in_algorithm=gaussian_optimization_burn_in_algorithm,
gaussian_optimization_burn_ins=gaussian_optimization_burn_ins,
gaussian_nugget=gaussian_nugget,
early_stopping=early_stopping,
)
self._type = "GaussianHyperparameterSearch"
self.validate()
# ----------------------------------------------------------------
def __str__(self):
obj_dict = copy.deepcopy(self.__dict__)
del obj_dict["pipeline"]
del obj_dict["param_space"]
del obj_dict["evaluations"]
obj_dict["type"] = self.type
obj_dict["score"] = self.score
sig = _SignatureFormatter(data=obj_dict)
return sig._format()
# ------------------------------------------------------------
def validate(self):
"""
Validate the parameters of the hyperparameter optimization.
"""
_validate_hyperopt(_Hyperopt._supported_params, **self.__dict__) # type: ignore
# -----------------------------------------------------------------------------
class LatinHypercubeSearch(_Hyperopt):
"""Latin hypercube sampling of the hyperparameters.
Uses a multidimensional, uniform cumulative distribution function
to draw the random numbers from. For drawing `n_iter` samples,
each dimension of the distribution is divided into `n_iter`
intervals of equal size, resulting in a grid of hypercubes.
`n_iter` of them are selected in such a way that each interval
is used only once per dimension, and an independent and
identically-distributed (iid) random number is drawn within the
boundaries of each selected hypercube.
A Latin hypercube search can be seen as a compromise between
a grid search, which iterates through the entire hyperparameter
space, and a random search, which draws completely random samples
from the hyperparameter space.
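The following is a minimal numpy sketch of the sampling scheme
described above; it is for illustration only, as the getML
engine implements this natively:
.. code-block:: python
import numpy as np
def sample_latin_hypercube(n_iter, bounds, rng):
    # bounds: list of (lower, upper) pairs, one per dimension.
    samples = np.empty((n_iter, len(bounds)))
    for dim, (lower, upper) in enumerate(bounds):
        # Split [0, 1] into n_iter intervals of equal size and
        # draw one iid uniform number inside each of them.
        points = (np.arange(n_iter) + rng.uniform(size=n_iter)) / n_iter
        # Use each interval exactly once per dimension.
        rng.shuffle(points)
        samples[:, dim] = lower + points * (upper - lower)
    return samples
draws = sample_latin_hypercube(
    30, [(10, 50), (0.0, 0.1)], np.random.default_rng(5483)
)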
Args:
param_space (dict):
Dictionary containing numerical arrays of length two
holding the lower and upper bounds of all parameters which
will be altered in `pipeline` during the hyperparameter
optimization.
If we have two feature learners and one predictor,
the hyperparameter space might look like this:
.. code-block:: python
param_space = {
"feature_learners": [
{
"num_features": [10, 50],
},
{
"max_depth": [1, 10],
"min_num_samples": [100, 500],
"num_features": [10, 50],
"reg_lambda": [0.0, 0.1],
"shrinkage": [0.01, 0.4]
}],
"predictors": [
{
"reg_lambda": [0.0, 10.0]
}
]
}
If we only want to optimize the predictor, then
we can leave out the feature learners.
pipeline (:class:`~getml.Pipeline`):
Base pipeline used to derive all models fitted and scored
during the hyperparameter optimization. Be careful in
constructing it since only those parameters present in
`param_space` will be overwritten. It defines the data
schema and any hyperparameters that are not optimized.
score (str, optional):
The score to optimize. Must be from
:mod:`~getml.pipeline.metrics`.
n_iter (int, optional):
Number of iterations in the hyperparameter optimization
and thus the number of parameter combinations to draw and
evaluate. Range: [1, :math:`\\infty`]
seed (int, optional):
Seed used for the random number generator that underlies
the sampling procedure, making the calculation
reproducible. Due to the nature of the underlying algorithm,
reproducibility is only guaranteed if the fit is done
without multithreading, i.e. if the ``num_threads`` and
``n_jobs`` instance variables of the ``predictor`` and
``feature_selector`` in `pipeline` - if they are instances of
either :class:`~getml.predictors.XGBoostRegressor` or
:class:`~getml.predictors.XGBoostClassifier` - are set to
1. Internally, a `seed` of None will be mapped to
5543. Range: [0, :math:`\\infty`]
Example:
.. code-block:: python
from getml import data
from getml import datasets
from getml import engine
from getml import feature_learning
from getml.feature_learning import aggregations
from getml.feature_learning import loss_functions
from getml import hyperopt
from getml import pipeline
from getml import predictors
# ----------------
engine.set_project("examples")
# ----------------
population_table, peripheral_table = datasets.make_numerical()
# ----------------
# Construct placeholders
population_placeholder = data.Placeholder("POPULATION")
peripheral_placeholder = data.Placeholder("PERIPHERAL")
population_placeholder.join(peripheral_placeholder, "join_key", "time_stamp")
# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.
fe1 = feature_learning.Multirel(
aggregation=[
aggregations.Count,
aggregations.Sum
],
loss_function=loss_functions.SquareLoss,
num_features=10,
share_aggregations=1.0,
max_length=1,
num_threads=0
)
# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.
fe2 = feature_learning.Relboost(
loss_function=loss_functions.SquareLoss,
num_features=10
)
# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.
predictor = predictors.LinearRegression()
# ----------------
pipe = pipeline.Pipeline(
population=population_placeholder,
peripheral=[peripheral_placeholder],
feature_learners=[fe1, fe2],
predictors=[predictor]
)
# ----------------
# Build a hyperparameter space.
# We have two feature learners and one
# predictor, so this is how we must
# construct our hyperparameter space.
# If we only wanted to optimize the predictor,
# we could just leave out the feature_learners.
param_space = {
"feature_learners": [
{
"num_features": [10, 50],
},
{
"max_depth": [1, 10],
"min_num_samples": [100, 500],
"num_features": [10, 50],
"reg_lambda": [0.0, 0.1],
"shrinkage": [0.01, 0.4]
}],
"predictors": [
{
"reg_lambda": [0.0, 10.0]
}
]
}
# ----------------
# Wrap a LatinHypercubeSearch around the reference model
latin_search = hyperopt.LatinHypercubeSearch(
pipeline=pipe,
param_space=param_space,
n_iter=30,
score=pipeline.metrics.rsquared
)
# fit(...) expects a data container holding the
# training and validation subsets.
container = data.Container(
    train=population_table,
    validation=population_table
)
container.add(PERIPHERAL=peripheral_table)
latin_search.fit(
    container=container,
    train="train",
    validation="validation"
)
Note:
Not supported in the getML community edition.
"""
def __init__(
self,
param_space: Dict[str, Any],
pipeline: Pipeline,
score=metrics.rmse,
n_iter=100,
seed=5483,
**kwargs,
):
super().__init__(
param_space=param_space,
pipeline=pipeline,
score=score,
n_iter=n_iter,
seed=seed,
**kwargs,
)
self._type = "LatinHypercubeSearch"
self.surrogate_burn_in_algorithm = latin_hypercube
self.validate()
# ----------------------------------------------------------------
def __str__(self):
obj_dict: Dict[str, Any] = {}
obj_dict["type"] = self.type
obj_dict["score"] = self.score
obj_dict["n_iter"] = self.n_iter
obj_dict["seed"] = self.seed
sig = _SignatureFormatter(data=obj_dict)
return sig._format()
# ------------------------------------------------------------
def validate(self):
"""
Validate the parameters of the hyperparameter optimization.
"""
_validate_hyperopt(_Hyperopt._supported_params, **self.__dict__) # type: ignore
if self.surrogate_burn_in_algorithm != latin_hypercube:
raise ValueError(
"'surrogate_burn_in_algorithm' must be '" + latin_hypercube + "'."
)
if self.ratio_iter != 1.0:
raise ValueError("'ratio_iter' must be 1.0.")
# -----------------------------------------------------------------------------
class RandomSearch(_Hyperopt):
"""Uniformly distributed sampling of the hyperparameters.
During each iteration, a new set of hyperparameters is chosen at random
by uniformly drawing a value between the lower and upper
bound of each dimension of `param_space` independently.
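A single iteration can be sketched in numpy as follows; this is
for illustration only, as the getML engine implements this
natively:
.. code-block:: python
import numpy as np
rng = np.random.default_rng(5483)
# Hypothetical one-dimensional bounds, as they would appear
# in a param_space.
bounds = {"num_features": (10, 50), "reg_lambda": (0.0, 10.0)}
# Draw each dimension independently and uniformly.
candidate = {
    name: rng.uniform(lower, upper)
    for name, (lower, upper) in bounds.items()
}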
Args:
param_space (dict):
Dictionary containing numerical arrays of length two
holding the lower and upper bounds of all parameters which
will be altered in `pipeline` during the hyperparameter
optimization.
If we have two feature learners and one predictor,
the hyperparameter space might look like this:
.. code-block:: python
param_space = {
"feature_learners": [
{
"num_features": [10, 50],
},
{
"max_depth": [1, 10],
"min_num_samples": [100, 500],
"num_features": [10, 50],
"reg_lambda": [0.0, 0.1],
"shrinkage": [0.01, 0.4]
}],
"predictors": [
{
"reg_lambda": [0.0, 10.0]
}
]
}
If we only want to optimize the predictor, then
we can leave out the feature learners.
pipeline (:class:`~getml.Pipeline`):
Base pipeline used to derive all models fitted and scored
during the hyperparameter optimization. Be careful in
constructing it since only those parameters present in
`param_space` will be overwritten. It defines the data
schema and any hyperparameters that are not optimized.
score (str, optional):
The score to optimize. Must be from
:mod:`~getml.pipeline.metrics`.
n_iter (int, optional):
Number of iterations in the hyperparameter optimization
and thus the number of parameter combinations to draw and
evaluate. Range: [1, :math:`\\infty`]
seed (int, optional):
Seed used for the random number generator that underlies
the sampling procedure, making the calculation
reproducible. Due to the nature of the underlying algorithm,
reproducibility is only guaranteed if the fit is done
without multithreading, i.e. if the ``num_threads`` and
``n_jobs`` instance variables of the ``predictor`` and
``feature_selector`` in `pipeline` - if they are instances of
either :class:`~getml.predictors.XGBoostRegressor` or
:class:`~getml.predictors.XGBoostClassifier` - are set to
1. Internally, a `seed` of None will be mapped to
5543. Range: [0, :math:`\\infty`]
Example:
.. code-block:: python
from getml import data
from getml import datasets
from getml import engine
from getml import feature_learning
from getml.feature_learning import aggregations
from getml.feature_learning import loss_functions
from getml import hyperopt
from getml import pipeline
from getml import predictors
# ----------------
engine.set_project("examples")
# ----------------
population_table, peripheral_table = datasets.make_numerical()
# ----------------
# Construct placeholders
population_placeholder = data.Placeholder("POPULATION")
peripheral_placeholder = data.Placeholder("PERIPHERAL")
population_placeholder.join(peripheral_placeholder, "join_key", "time_stamp")
# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.
fe1 = feature_learning.Multirel(
aggregation=[
aggregations.Count,
aggregations.Sum
],
loss_function=loss_functions.SquareLoss,
num_features=10,
share_aggregations=1.0,
max_length=1,
num_threads=0
)
# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.
fe2 = feature_learning.Relboost(
loss_function=loss_functions.SquareLoss,
num_features=10
)
# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.
predictor = predictors.LinearRegression()
# ----------------
pipe = pipeline.Pipeline(
population=population_placeholder,
peripheral=[peripheral_placeholder],
feature_learners=[fe1, fe2],
predictors=[predictor]
)
# ----------------
# Build a hyperparameter space.
# We have two feature learners and one
# predictor, so this is how we must
# construct our hyperparameter space.
# If we only wanted to optimize the predictor,
# we could just leave out the feature_learners.
param_space = {
"feature_learners": [
{
"num_features": [10, 50],
},
{
"max_depth": [1, 10],
"min_num_samples": [100, 500],
"num_features": [10, 50],
"reg_lambda": [0.0, 0.1],
"shrinkage": [0.01, 0.4]
}],
"predictors": [
{
"reg_lambda": [0.0, 10.0]
}
]
}
# ----------------
# Wrap a RandomSearch around the reference model
random_search = hyperopt.RandomSearch(
pipeline=pipe,
param_space=param_space,
n_iter=30,
score=pipeline.metrics.rsquared
)
# fit(...) expects a data container holding the
# training and validation subsets.
container = data.Container(
    train=population_table,
    validation=population_table
)
container.add(PERIPHERAL=peripheral_table)
random_search.fit(
    container=container,
    train="train",
    validation="validation"
)
Note:
Not supported in the getML community edition.
"""
def __init__(
self,
param_space: Dict[str, Any],
pipeline: Pipeline,
score=metrics.rmse,
n_iter=100,
seed=5483,
**kwargs,
):
super().__init__(
param_space=param_space,
pipeline=pipeline,
score=score,
n_iter=n_iter,
seed=seed,
**kwargs,
)
self._type = "RandomSearch"
self.surrogate_burn_in_algorithm = random
self.validate()
# ----------------------------------------------------------------
def __str__(self):
obj_dict: Dict[str, Any] = {}
obj_dict["type"] = self.type
obj_dict["score"] = self.score
obj_dict["n_iter"] = self.n_iter
obj_dict["seed"] = self.seed
sig = _SignatureFormatter(data=obj_dict)
return sig._format()
# ------------------------------------------------------------
def validate(self):
"""
Validate the parameters of the hyperparameter optimization.
"""
_validate_hyperopt(_Hyperopt._supported_params, **self.__dict__) # type: ignore
if self.surrogate_burn_in_algorithm != random:
raise ValueError("'surrogate_burn_in_algorithm' must be '" + random + "'.")
if self.ratio_iter != 1.0:
raise ValueError("'ratio_iter' must be 1.0.")
# -----------------------------------------------------------------------------