# Copyright 2021 The SQLNet Company GmbH
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
"""
Contains helper functions for validating the feature learning algorithms.
"""
import numbers
import numpy as np
from getml.data.helpers import _is_non_empty_typed_list, _is_typed_list
from getml.helpers import _check_parameter_bounds
from getml.log import MIGHT_TAKE_LONG, logger
from .aggregations import _all_aggregations, _multirel_subset
# --------------------------------------------------------------------
class Validator:
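    """Data descriptor for feature learner parameters: the first assignment
    (typically in ``__init__``) is stored as-is, every later assignment is
    passed through the owning instance's ``validate`` method first.
    """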
def __init__(self, name):
self.name = name
def __set__(self, instance, value):
        # Don't call validate upon initialization.
if self.name in instance.__dict__:
instance.validate({self.name: value})
instance.__dict__[self.name] = value
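# A minimal usage sketch (illustration only; the class name and the
# validate() body below are hypothetical and not part of this module):
#
#     class _Example:
#         num_features = Validator("num_features")
#
#         def __init__(self):
#             self.num_features = 100  # first assignment: stored, not validated
#
#         def validate(self, params):
#             if params.get("num_features", 1) < 1:
#                 raise ValueError("'num_features' must be positive.")
#
#     ex = _Example()
#     ex.num_features = 200  # __set__ calls ex.validate({"num_features": 200})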
# --------------------------------------------------------------------
def _validate_dfs_model_parameters(**kwargs):
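    """Checks both the types and values of the DFS model `parameters` and
    raises an exception if something is off.
    """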
# ----------------------------------------------------------------
aggregation = kwargs["aggregation"]
delta_t = kwargs["delta_t"]
loss_function = kwargs["loss_function"]
max_lag = kwargs["max_lag"]
min_df = kwargs["min_df"]
n_most_frequent = kwargs["n_most_frequent"]
num_features = kwargs["num_features"]
num_threads = kwargs["num_threads"]
sampling_factor = kwargs["sampling_factor"]
silent = kwargs["silent"]
vocab_size = kwargs["vocab_size"]
# ----------------------------------------------------------------
if not _is_non_empty_typed_list(aggregation, str):
raise TypeError(
"""'aggregation' must be a non-empty
list of str found in getml.feature_learning.aggregations"""
)
if not isinstance(delta_t, numbers.Real):
raise TypeError("'delta_t' must be a real number")
    if not isinstance(loss_function, str):
raise TypeError("'loss_function' must be a str")
if not isinstance(max_lag, int):
raise TypeError("'max_lag' must be an integer")
if not isinstance(min_df, numbers.Real):
raise TypeError("'min_df' must be a real number")
if not isinstance(num_features, numbers.Real):
raise TypeError("'num_features' must be a real number")
if not isinstance(n_most_frequent, numbers.Real):
raise TypeError("'n_most_frequent' must be a real number")
if not isinstance(num_threads, numbers.Real):
raise TypeError("'num_threads' must be a real number")
if not isinstance(sampling_factor, numbers.Real):
raise TypeError("'sampling_factor' must be a real number")
if not isinstance(silent, bool):
raise TypeError("'silent' must be of type bool")
if not isinstance(vocab_size, numbers.Real):
raise TypeError("'vocab_size' must be a real number")
# ----------------------------------------------------------------
    if not all(aa in _all_aggregations for aa in aggregation):
        raise ValueError(
            """'aggregation' must be a list composed of the following
aggregations defined in getml.feature_learning.aggregations: """
+ str(_all_aggregations)
+ "."
)
_check_parameter_bounds(delta_t, "delta_t", [0, np.finfo(np.float64).max])
_check_parameter_bounds(max_lag, "max_lag", [0, np.iinfo(np.int32).max])
_check_parameter_bounds(min_df, "min_df", [0, np.iinfo(np.int32).max])
_check_parameter_bounds(
n_most_frequent, "n_most_frequent", [0, np.iinfo(np.int32).max]
)
_check_parameter_bounds(num_features, "num_features", [1, np.iinfo(np.int32).max])
_check_parameter_bounds(num_threads, "num_threads", [0, np.iinfo(np.int32).max])
_check_parameter_bounds(
sampling_factor, "sampling_factor", [0, np.finfo(np.float64).max]
)
_check_parameter_bounds(vocab_size, "vocab_size", [0, np.finfo(np.float64).max])
# ----------------------------------------------------------------
if (delta_t > 0.0) ^ (max_lag > 0):
raise ValueError(
"""If you pass a non-zero value to delta_t, you must also
pass a non-zero
value to max_lag and vice-versa."""
)
# ----------------------------------------------------------------
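# Illustration only (the parameter values below are hypothetical and the
# aggregation strings are assumed to be members of _all_aggregations):
#
#     _validate_dfs_model_parameters(
#         aggregation=["AVG", "SUM"], delta_t=0.0, loss_function="SquareLoss",
#         max_lag=0, min_df=30, n_most_frequent=0, num_features=200,
#         num_threads=0, sampling_factor=1.0, silent=True, vocab_size=500,
#     )  # passes
#
# The same call with delta_t=90.0 but max_lag=0 raises a ValueError, since
# delta_t and max_lag must be zero or non-zero together.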
# --------------------------------------------------------------------
def _validate_multirel_parameters(**kwargs):
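    """Checks both the types and values of the `parameters` belonging to
    :class:`~getml.feature_learning.Multirel` and raises an exception if
    something is off.
    """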
# ----------------------------------------------------------------
aggregation = kwargs["aggregation"]
allow_sets = kwargs["allow_sets"]
delta_t = kwargs["delta_t"]
grid_factor = kwargs["grid_factor"]
loss_function = kwargs["loss_function"]
max_length = kwargs["max_length"]
min_df = kwargs["min_df"]
min_num_samples = kwargs["min_num_samples"]
num_features = kwargs["num_features"]
num_subfeatures = kwargs["num_subfeatures"]
num_threads = kwargs["num_threads"]
propositionalization = kwargs["propositionalization"]
regularization = kwargs["regularization"]
round_robin = kwargs["round_robin"]
sampling_factor = kwargs["sampling_factor"]
seed = kwargs["seed"]
share_aggregations = kwargs["share_aggregations"]
share_conditions = kwargs["share_conditions"]
shrinkage = kwargs["shrinkage"]
vocab_size = kwargs["vocab_size"]
# ----------------------------------------------------------------
if not _is_non_empty_typed_list(aggregation, str):
raise TypeError(
"""'aggregation' must be a non-empty
list of str found in getml.feature_learning.aggregations"""
)
if not isinstance(allow_sets, bool):
raise TypeError("'allow_sets' must be of type bool")
if not isinstance(delta_t, numbers.Real):
raise TypeError("'delta_t' must be a real number")
if not isinstance(grid_factor, numbers.Real):
raise TypeError("'grid_factor' must be a real number")
    if not isinstance(loss_function, str):
raise TypeError("'loss_function' must be a str")
if not isinstance(max_length, numbers.Real):
raise TypeError("'max_length' must be a real number")
if not isinstance(min_df, numbers.Real):
raise TypeError("'min_df' must be a real number")
if not isinstance(min_num_samples, numbers.Real):
raise TypeError("'min_num_samples' must be a real number")
if not isinstance(num_features, numbers.Real):
raise TypeError("'num_features' must be a real number")
if not isinstance(num_subfeatures, numbers.Real):
raise TypeError("'num_subfeatures' must be a real number")
if not isinstance(num_threads, numbers.Real):
raise TypeError("'num_threads' must be a real number")
if type(propositionalization).__name__ != "FastProp":
raise TypeError("'propositionalization' must be a FastProp")
if not isinstance(regularization, numbers.Real):
raise TypeError("'regularization' must be a real number")
if not isinstance(round_robin, bool):
raise TypeError("'round_robin' must be of type bool")
if not isinstance(sampling_factor, numbers.Real):
raise TypeError("'sampling_factor' must be a real number")
if not isinstance(seed, numbers.Real):
raise TypeError("'seed' must be a real number or None")
if not isinstance(share_aggregations, numbers.Real):
raise TypeError("'share_aggregations' must be a real number")
if not isinstance(share_conditions, numbers.Real):
raise TypeError("'share_conditions' must be a real number")
if not isinstance(shrinkage, numbers.Real):
raise TypeError("'shrinkage' must be a real number")
if not isinstance(vocab_size, numbers.Real):
raise TypeError("'vocab_size' must be a real number")
    # ----------------------------------------------------------------
    _check_parameter_bounds(vocab_size, "vocab_size", [0, np.finfo(np.float64).max])
    if not all(aa in _multirel_subset for aa in aggregation):
        raise ValueError(
            """'aggregation' must be a list composed of the following
aggregations defined in getml.feature_learning.aggregations: """
+ str(_multirel_subset)
+ "."
)
_check_parameter_bounds(delta_t, "delta_t", [0.0, np.finfo(np.float64).max])
_check_parameter_bounds(
grid_factor,
"grid_factor",
[np.finfo(np.float64).resolution, np.finfo(np.float64).max],
)
_check_parameter_bounds(max_length, "max_length", [0, np.iinfo(np.int32).max])
_check_parameter_bounds(min_df, "min_df", [0, np.iinfo(np.int32).max])
_check_parameter_bounds(
min_num_samples, "min_num_samples", [1, np.iinfo(np.int32).max]
)
_check_parameter_bounds(num_features, "num_features", [1, np.iinfo(np.int32).max])
_check_parameter_bounds(
num_subfeatures, "num_subfeatures", [1, np.iinfo(np.int32).max]
)
_check_parameter_bounds(num_threads, "num_threads", [0, np.iinfo(np.int32).max])
_check_parameter_bounds(regularization, "regularization", [0.0, 1.0])
_check_parameter_bounds(
sampling_factor, "sampling_factor", [0.0, np.finfo(np.float64).max]
)
_check_parameter_bounds(seed, "seed", [0.0, np.iinfo(np.uint64).max])
_check_parameter_bounds(share_aggregations, "share_aggregations", [0.0, 1.0])
_check_parameter_bounds(share_conditions, "share_conditions", [0.0, 1.0])
_check_parameter_bounds(shrinkage, "shrinkage", [0.0, 1.0])
# ----------------------------------------------------------------
    if num_subfeatures > 10:
        logger.info(
            MIGHT_TAKE_LONG
            + "You have set num_subfeatures to "
            + str(num_subfeatures)
            + """. The Multirel algorithm does not scale well to many
            columns. You should consider using Relboost or
            RelboostTimeSeries instead."""
        )
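# Note: unlike _validate_dfs_model_parameters above, which accepts any
# aggregation in _all_aggregations, this validator only accepts the Multirel
# subset (_multirel_subset), so an aggregation string that passes the DFS
# validator can still raise a ValueError here.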
# --------------------------------------------------------------------
def _validate_relboost_parameters(**kwargs):
"""Checks both the types and values of the `parameters` belonging to
    :class:`~getml.feature_learning.Relboost` and raises an exception if
something is off.
"""
# ----------------------------------------------------------------
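    # 'allow_avg' and 'allow_null_weights' are optional: they are only
    # extracted here and type-checked below if they were actually passed.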
if "allow_avg" in kwargs:
allow_avg = kwargs["allow_avg"]
if "allow_null_weights" in kwargs:
allow_null_weights = kwargs["allow_null_weights"]
# ----------------------------------------------------------------
delta_t = kwargs["delta_t"]
gamma = kwargs["gamma"]
loss_function = kwargs["loss_function"]
max_depth = kwargs["max_depth"]
min_df = kwargs["min_df"]
min_num_samples = kwargs["min_num_samples"]
num_features = kwargs["num_features"]
num_subfeatures = kwargs["num_subfeatures"]
num_threads = kwargs["num_threads"]
propositionalization = kwargs["propositionalization"]
reg_lambda = kwargs["reg_lambda"]
sampling_factor = kwargs["sampling_factor"]
seed = kwargs["seed"]
shrinkage = kwargs["shrinkage"]
vocab_size = kwargs["vocab_size"]
# ----------------------------------------------------------------
if "allow_avg" in kwargs and not isinstance(allow_avg, bool):
raise TypeError("'allow_avg' must be of type bool")
if "allow_null_weights" in kwargs and not isinstance(allow_null_weights, bool):
raise TypeError("'allow_null_weights' must be of type bool")
if not isinstance(delta_t, numbers.Real):
raise TypeError("'delta_t' must be a real number")
if not isinstance(gamma, numbers.Real):
raise TypeError("'gamma' must be a real number")
    if not isinstance(loss_function, str):
raise TypeError("'loss_function' must be a str")
if not isinstance(max_depth, numbers.Real):
raise TypeError("'max_depth' must be a real number")
if not isinstance(min_df, numbers.Real):
raise TypeError("'min_df' must be a real number")
if not isinstance(min_num_samples, numbers.Real):
raise TypeError("'min_num_samples' must be a real number")
if not isinstance(num_features, numbers.Real):
raise TypeError("'num_features' must be a real number")
if not isinstance(num_subfeatures, numbers.Real):
raise TypeError("'num_subfeatures' must be a real number")
if not isinstance(num_threads, numbers.Real):
raise TypeError("'num_threads' must be a real number")
if type(propositionalization).__name__ != "FastProp":
raise TypeError("'propositionalization' must be a FastProp")
if not isinstance(reg_lambda, numbers.Real):
raise TypeError("'reg_lambda' must be a real number")
if not isinstance(sampling_factor, numbers.Real):
raise TypeError("'sampling_factor' must be a real number")
if not isinstance(seed, numbers.Real):
raise TypeError("'seed' must be a real number or None")
if not isinstance(shrinkage, numbers.Real):
raise TypeError("'shrinkage' must be a real number")
if not isinstance(vocab_size, numbers.Real):
raise TypeError("'vocab_size' must be a real number")
# ----------------------------------------------------------------
_check_parameter_bounds(delta_t, "delta_t", [0.0, np.finfo(np.float64).max])
_check_parameter_bounds(gamma, "gamma", [0.0, np.finfo(np.float64).max])
_check_parameter_bounds(max_depth, "max_depth", [0, np.iinfo(np.int32).max])
_check_parameter_bounds(min_df, "min_df", [0, np.iinfo(np.int32).max])
_check_parameter_bounds(
min_num_samples, "min_num_samples", [1, np.iinfo(np.int32).max]
)
_check_parameter_bounds(num_features, "num_features", [1, np.iinfo(np.int32).max])
_check_parameter_bounds(
num_subfeatures, "num_subfeatures", [1, np.iinfo(np.int32).max]
)
_check_parameter_bounds(num_threads, "num_threads", [0, np.iinfo(np.int32).max])
_check_parameter_bounds(reg_lambda, "reg_lambda", [0.0, np.finfo(np.float64).max])
_check_parameter_bounds(
sampling_factor, "sampling_factor", [0.0, np.finfo(np.float64).max]
)
_check_parameter_bounds(seed, "seed", [0.0, np.iinfo(np.uint64).max])
_check_parameter_bounds(shrinkage, "shrinkage", [0.0, 1.0])
_check_parameter_bounds(vocab_size, "vocab_size", [0, np.finfo(np.float64).max])
# --------------------------------------------------------------------
def _validate_time_series_parameters(**kwargs):
"""
Validates the parameters that are specific to the time series
    models and raises an exception if something is off.
"""
# ----------------------------------------------------------------
horizon = kwargs["horizon"]
memory = kwargs["memory"]
self_join_keys = kwargs["self_join_keys"]
ts_name = kwargs["ts_name"]
lagged_targets = kwargs["lagged_targets"]
# ----------------------------------------------------------------
if not isinstance(horizon, numbers.Real):
raise TypeError("'horizon' must be a real number")
if not isinstance(memory, numbers.Real):
raise TypeError("'memory' must be a real number")
if not _is_typed_list(self_join_keys, str):
raise TypeError("'self_join_keys' must be list of str.")
if not isinstance(ts_name, str):
raise TypeError("'ts_name' must be a str")
if not isinstance(lagged_targets, bool):
raise TypeError("'lagged_targets' must be a bool")
# ----------------------------------------------------------------
_check_parameter_bounds(horizon, "horizon", [0.0, np.finfo(np.float64).max])
_check_parameter_bounds(memory, "memory", [0.0, np.finfo(np.float64).max])
if horizon == 0.0 and lagged_targets:
raise ValueError(
"""If your horizon is 0.0, then you cannot
lagged_targets. This is a data leak."""
)
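# Illustration only (hypothetical values): lagged targets require a non-zero
# horizon, otherwise the current target would leak into the features.
#
#     _validate_time_series_parameters(
#         horizon=3600.0, memory=7200.0, self_join_keys=["id"],
#         ts_name="time_stamp", lagged_targets=True,
#     )  # passes
#
#     _validate_time_series_parameters(
#         horizon=0.0, memory=7200.0, self_join_keys=["id"],
#         ts_name="time_stamp", lagged_targets=True,
#     )  # raises ValueError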
# --------------------------------------------------------------------