From 004cf9a11d6cb69b4d770f06cbef1d579754e29e Mon Sep 17 00:00:00 2001
From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?=
Date: Thu, 8 Jan 2026 12:28:55 +0100
Subject: [PATCH] feat(quickadapter): add NGBoost regressor support with
 Optuna optimization (#33)
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

* feat: add NGBoost regressor support with Optuna optimization

- Add NGBoost to supported regressors (xgboost, lightgbm,
  histgradientboostingregressor, ngboost, catboost)
- Install ngboost==0.5.8 in the Docker image
- Implement fit_regressor branch for NGBoost with:
  - Dynamic distribution selection via the get_ngboost_dist() helper
  - Support for 5 distributions: normal, lognormal, exponential, laplace, t
  - Early stopping support with a validation set (X_val/Y_val API)
  - Sample weights support for training and validation
  - Optuna trial handling with random_state adjustment
  - Verbosity parameter conversion (verbosity -> verbose)
- Add Optuna hyperparameter optimization support:
  - n_estimators: [100, 1000] (log-scaled)
  - learning_rate: [0.001, 0.3] (log-scaled)
  - minibatch_frac: [0.5, 1.0] (linear)
  - col_sample: [0.3, 1.0] (linear)
  - dist: categorical [normal, lognormal]
  - Space reduction support for refined optimization
- Create get_ngboost_dist() helper function for distribution class mapping
- Default distribution: lognormal (well suited to strictly positive crypto
  prices)
- Compatible with the RMSE optimization objective (minimizing the default
  LogScore closely tracks RMSE for the normal family)

* docs: add ngboost to regressor enum in README

* fix: correct NGBoost parameter comment to reflect actual tuned parameters

Removed 'tree structure' from the parameter order comment since the NGBoost
implementation didn't tune tree structure parameters at that point (only
boosting, sampling, and distribution parameters were optimized via Optuna).

* feat(ngboost): add tree structure parameter tuning

Add DecisionTreeRegressor base learner parameters for NGBoost:
- max_depth: (3, 8) based on the literature and XGBoost patterns
- min_samples_split: (2, 20) following sklearn best practices
- min_samples_leaf: (1, 10) a conservative range for crypto data

These parameters are passed via the Base argument to control the underlying
decision tree learners in the NGBoost ensemble.

* refine(ngboost): narrow sampling and leaf hyperparameter ranges

Refined the Optuna search space based on gradient boosting research:
- min_samples_leaf: 1-8 (was 1-10)
- minibatch_frac: 0.6-1.0 (was 0.5-1.0)
- col_sample: 0.4-1.0 (was 0.3-1.0)

The ranges focus on empirically well-performing zones for gradient boosting
ensembles on financial/crypto time series data.

* refactor(ngboost): move DecisionTreeRegressor import to branch start

Move the sklearn.tree.DecisionTreeRegressor import to the beginning of the
NGBoost branch (right after the NGBRegressor import) for better code
organization and consistency with import conventions.
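
For reference, a minimal sketch of the new code path as it would be
exercised outside fit_regressor() (illustrative only: X_train, y_train,
X_valid, and y_valid are placeholder pandas objects, not part of this
patch):

    from ngboost import NGBRegressor
    from sklearn.tree import DecisionTreeRegressor

    model = NGBRegressor(
        Dist=get_ngboost_dist("lognormal"),  # map name -> ngboost.distns class
        Base=DecisionTreeRegressor(criterion="friedman_mse", max_depth=5),
        n_estimators=500,
        learning_rate=0.05,
        random_state=1,
    )
    model.fit(
        X=X_train,
        Y=y_train.to_numpy().ravel(),
        X_val=X_valid,
        Y_val=y_valid.to_numpy().ravel(),
        early_stopping_rounds=50,
    )
    y_point = model.predict(X_valid)   # point predictions
    y_dist = model.pred_dist(X_valid)  # full predictive distribution
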
---
 README.md                                  |   2 +-
 quickadapter/Dockerfile                    |   2 +
 quickadapter/user_data/strategies/Utils.py | 140 ++++++++++++++++++++-
 3 files changed, 139 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 5380209..d780f1e 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ docker compose up -d --build
 | reversal_confirmation.min_natr_multiplier_fraction | 0.0095 | float [0,1] | Lower bound fraction for volatility adjusted reversal threshold. (Deprecated alias: `reversal_confirmation.min_natr_ratio_percent`) |
 | reversal_confirmation.max_natr_multiplier_fraction | 0.075 | float [0,1] | Upper bound fraction (>= lower bound) for volatility adjusted reversal threshold. (Deprecated alias: `reversal_confirmation.max_natr_ratio_percent`) |
 | _Regressor model_ | | | |
-| freqai.regressor | `xgboost` | enum {`xgboost`,`lightgbm`,`histgradientboostingregressor`,`catboost`} | Machine learning regressor algorithm. |
+| freqai.regressor | `xgboost` | enum {`xgboost`,`lightgbm`,`histgradientboostingregressor`,`ngboost`,`catboost`} | Machine learning regressor algorithm. |
 | _Extrema smoothing_ | | | |
 | freqai.extrema_smoothing.method | `gaussian` | enum {`gaussian`,`kaiser`,`triang`,`smm`,`sma`,`savgol`,`gaussian_filter1d`} | Extrema smoothing method (`smm`=median, `sma`=mean, `savgol`=Savitzky–Golay). |
 | freqai.extrema_smoothing.window_candles | 5 | int >= 3 | Smoothing window length (candles). (Deprecated alias: `freqai.extrema_smoothing.window`) |
diff --git a/quickadapter/Dockerfile b/quickadapter/Dockerfile
index 413f4cc..745d094 100644
--- a/quickadapter/Dockerfile
+++ b/quickadapter/Dockerfile
@@ -3,6 +3,7 @@ FROM freqtradeorg/freqtrade:stable_freqai
 ARG optuna_version=4.6.0
 ARG scikit_learn_extra_version=0.3.0
 ARG scikit_image_version=0.26.0
+ARG ngboost_version=0.5.8
 ARG catboost_version=1.2.8
 
 USER root
@@ -18,6 +19,7 @@ RUN pip install --user --no-cache-dir \
     -r https://hub.optuna.org/samplers/auto_sampler/requirements.txt \
     scikit-learn-extra==${scikit_learn_extra_version} \
     scikit-image==${scikit_image_version} \
+    ngboost==${ngboost_version} \
     catboost==${catboost_version}
 
 LABEL org.opencontainers.image.source="freqai-strategies" \
diff --git a/quickadapter/user_data/strategies/Utils.py b/quickadapter/user_data/strategies/Utils.py
index bd74786..a4e00bf 100644
--- a/quickadapter/user_data/strategies/Utils.py
+++ b/quickadapter/user_data/strategies/Utils.py
@@ -1612,11 +1612,12 @@ def zigzag(
 )
 
 
-Regressor = Literal["xgboost", "lightgbm", "histgradientboostingregressor", "catboost"]
+Regressor = Literal["xgboost", "lightgbm", "histgradientboostingregressor", "ngboost", "catboost"]
 REGRESSORS: Final[tuple[Regressor, ...]] = (
     "xgboost",
     "lightgbm",
     "histgradientboostingregressor",
+    "ngboost",
     "catboost",
 )
 
@@ -1625,6 +1626,25 @@ RegressorCallback = Union[Callable[..., Any], XGBoostTrainingCallback]
 
 _EARLY_STOPPING_ROUNDS_DEFAULT: Final[int] = 50
 
+def get_ngboost_dist(dist_name: str) -> type:
+    from ngboost.distns import Exponential, Laplace, LogNormal, Normal, T
+
+    dist_map = {
+        "normal": Normal,
+        "lognormal": LogNormal,
+        "exponential": Exponential,
+        "laplace": Laplace,
+        "t": T,
+    }
+
+    if dist_name not in dist_map:
+        raise ValueError(
+            f"Invalid dist_name {dist_name!r}: supported values are {', '.join(dist_map.keys())}"
+        )
+
+    return dist_map[dist_name]
+
+
 def fit_regressor(
     regressor: Regressor,
     X: pd.DataFrame,
@@ -1790,7 +1810,63 @@ def fit_regressor(
             y_val=y_val,
             sample_weight_val=sample_weight_val,
         )
-    elif regressor == REGRESSORS[3]:  # "catboost"
+    elif regressor == REGRESSORS[3]:  # "ngboost"
+        from ngboost import NGBRegressor
+        from sklearn.tree import DecisionTreeRegressor
+
+        model_training_parameters.setdefault("random_state", 1)
+
+        verbosity = model_training_parameters.pop("verbosity", None)
+        if "verbose" not in model_training_parameters and verbosity is not None:
+            model_training_parameters["verbose"] = verbosity
+
+        model_training_parameters.pop("n_jobs", None)
+
+        early_stopping_rounds = None
+        if has_eval_set:
+            early_stopping_rounds = model_training_parameters.pop(
+                "early_stopping_rounds", _EARLY_STOPPING_ROUNDS_DEFAULT
+            )
+        else:
+            model_training_parameters.pop("early_stopping_rounds", None)
+
+        if trial is not None:
+            model_training_parameters["random_state"] = (
+                model_training_parameters["random_state"] + trial.number
+            )
+
+        dist = model_training_parameters.pop("dist", "lognormal")
+
+        X_val = None
+        Y_val = None
+        val_sample_weight = None
+        if has_eval_set:
+            X_val, Y_val = eval_set[0]
+            Y_val = Y_val.to_numpy().ravel()
+            if eval_weights is not None and len(eval_weights) > 0:
+                val_sample_weight = eval_weights[0]
+
+        model = NGBRegressor(
+            Dist=get_ngboost_dist(dist),
+            Base=DecisionTreeRegressor(
+                criterion="friedman_mse",
+                max_depth=model_training_parameters.pop("max_depth", None),
+                min_samples_split=model_training_parameters.pop("min_samples_split", 2),
+                min_samples_leaf=model_training_parameters.pop("min_samples_leaf", 1),
+            ),
+            **model_training_parameters,
+        )
+
+        model.fit(
+            X=X,
+            Y=y.to_numpy().ravel(),
+            sample_weight=train_weights,
+            X_val=X_val,
+            Y_val=Y_val,
+            val_sample_weight=val_sample_weight,
+            early_stopping_rounds=early_stopping_rounds,
+        )
+    elif regressor == REGRESSORS[4]:  # "catboost"
         from catboost import CatBoostRegressor, Pool
 
         model_training_parameters.setdefault("random_seed", 1)
@@ -2255,7 +2331,64 @@ def get_optuna_study_model_parameters(
             ),
         }
 
-    elif regressor == REGRESSORS[3]:  # "catboost"
+    elif regressor == REGRESSORS[3]:  # "ngboost"
+        # Parameter order: boosting -> tree structure -> sampling -> distribution
+        default_ranges: dict[str, tuple[float, float]] = {
+            # Boosting/Training
+            "n_estimators": (100, 1000),
+            "learning_rate": (0.001, 0.3),
+            # Tree structure
+            "max_depth": (3, 8),
+            "min_samples_split": (2, 20),
+            "min_samples_leaf": (1, 8),
+            # Sampling
+            "minibatch_frac": (0.6, 1.0),
+            "col_sample": (0.4, 1.0),
+        }
+        log_scaled_params = {
+            "n_estimators",
+            "learning_rate",
+        }
+
+        ranges = _build_ranges(default_ranges, log_scaled_params)
+
+        return {
+            # Boosting/Training
+            "n_estimators": _optuna_suggest_int_from_range(
+                trial, "n_estimators", ranges["n_estimators"], min_val=1, log=True
+            ),
+            "learning_rate": trial.suggest_float(
+                "learning_rate",
+                ranges["learning_rate"][0],
+                ranges["learning_rate"][1],
+                log=True,
+            ),
+            # Tree structure
+            "max_depth": _optuna_suggest_int_from_range(
+                trial, "max_depth", ranges["max_depth"], min_val=1
+            ),
+            "min_samples_split": _optuna_suggest_int_from_range(
+                trial, "min_samples_split", ranges["min_samples_split"], min_val=2
+            ),
+            "min_samples_leaf": _optuna_suggest_int_from_range(
+                trial, "min_samples_leaf", ranges["min_samples_leaf"], min_val=1
+            ),
+            # Sampling
+            "minibatch_frac": trial.suggest_float(
+                "minibatch_frac",
+                ranges["minibatch_frac"][0],
+                ranges["minibatch_frac"][1],
+            ),
+            "col_sample": trial.suggest_float(
+                "col_sample",
+                ranges["col_sample"][0],
+                ranges["col_sample"][1],
+            ),
+            # Distribution
+            "dist": trial.suggest_categorical("dist", ["normal", "lognormal"]),
+        }
+
+    elif regressor == REGRESSORS[4]:  # "catboost"
         # Parameter order: boosting -> tree structure -> regularization -> sampling
         task_type = model_training_parameters.get("task_type", "CPU")
         if task_type == "GPU":
@@ -2376,7 +2509,6 @@ def get_optuna_study_model_parameters(
         )
 
         return params
-
     else:
         raise ValueError(
             f"Invalid regressor value {regressor!r}: supported values are {', '.join(REGRESSORS)}"
--
2.43.0
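
As a companion to the tuned search space above, a self-contained sketch of
an equivalent standalone Optuna objective (an assumed harness on synthetic
data; inside the strategy this flow goes through
get_optuna_study_model_parameters() and its _build_ranges /
_optuna_suggest_int_from_range helpers instead):

    import numpy as np
    import optuna
    from ngboost import NGBRegressor
    from ngboost.distns import LogNormal, Normal
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor

    # Synthetic stand-in data; the strategy supplies real features/labels.
    rng = np.random.default_rng(1)
    X = rng.normal(size=(500, 8))
    y = np.exp(0.1 * X[:, 0] + rng.normal(scale=0.1, size=500))  # positive target
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1)

    def objective(trial: optuna.Trial) -> float:
        # Mirror the ranges tuned above (boosting -> tree -> sampling -> dist).
        dist = trial.suggest_categorical("dist", ["normal", "lognormal"])
        model = NGBRegressor(
            Dist=LogNormal if dist == "lognormal" else Normal,
            Base=DecisionTreeRegressor(
                criterion="friedman_mse",
                max_depth=trial.suggest_int("max_depth", 3, 8),
                min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
                min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 8),
            ),
            n_estimators=trial.suggest_int("n_estimators", 100, 1000, log=True),
            learning_rate=trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
            minibatch_frac=trial.suggest_float("minibatch_frac", 0.6, 1.0),
            col_sample=trial.suggest_float("col_sample", 0.4, 1.0),
            random_state=1 + trial.number,
            verbose=False,
        )
        model.fit(X_train, y_train, X_val=X_val, Y_val=y_val,
                  early_stopping_rounds=50)
        # Validation RMSE, matching the strategy's RMSE optimization objective.
        return float(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)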