feat(weights): per-label sample weights propagated to model.fit(sample_weight=...)

author Jérôme Benoit <jerome.benoit@piment-noir.org>
Sun, 24 May 2026 23:09:27 +0000 (01:09 +0200)
committer GitHub <noreply@github.com>
Sun, 24 May 2026 23:09:27 +0000 (01:09 +0200)
diff --git a/quickadapter/user_data/config-template.json b/quickadapter/user_data/config-template.json

index 0cfff0aa3e6fe53b6498a4d9892a82360682b8e7..08aa8ee4dd11e9e1be1d0eb092b1c9a0a4450f08 100644 (file)
--- a/quickadapter/user_data/config-template.json
+++ b/quickadapter/user_data/config-template.json
@@ -109,6 +109,10 @@
        //   }
        // }
      },
+    "sample_weighting": {
+      "aggregation": "arithmetic_mean",
+      "softmax_temperature": 1.0
+    },
      "label_smoothing": {
        "method": "kaiser",
        "window_candles": 5,
@@ -190,7 +194,7 @@
        "indicator_periods_candles": [8, 16, 32],
        "inlier_metric_window": 0,
        "noise_standard_deviation": 0.02,
-      "reverse_test_train_order": false,
+      "reverse_train_test_order": false,
        "plot_feature_importances": 0,
        "buffer_train_data_candles": 100
      },
diff --git a/quickadapter/user_data/freqaimodels/QuickAdapterRegressorV3.py b/quickadapter/user_data/freqaimodels/QuickAdapterRegressorV3.py

index a9aebce4fa6fe98381ebcca6118832970bfddfd8..60d713faf9a073673fcb04a63d8a204d955b0e65 100644 (file)
--- a/quickadapter/user_data/freqaimodels/QuickAdapterRegressorV3.py
+++ b/quickadapter/user_data/freqaimodels/QuickAdapterRegressorV3.py
@@ -3,6 +3,7 @@ import logging
  import random
  import time
  import warnings
+from collections import Counter
  from functools import lru_cache
  from pathlib import Path
  from typing import AbstractSet, Any, Callable, Final, Literal, Optional, Union, cast
@@ -21,7 +22,7 @@ from freqtrade.freqai.base_models.BaseRegressionModel import BaseRegressionModel
  from freqtrade.freqai.data_kitchen import FreqaiDataKitchen
  from numpy.typing import NDArray
  from optuna.study.study import ObjectiveFuncType
-from sklearn.model_selection import TimeSeriesSplit
+from sklearn.model_selection import TimeSeriesSplit, train_test_split
  from sklearn.preprocessing import (
      MaxAbsScaler,
      MinMaxScaler,
@@ -50,19 +51,23 @@ from Utils import (
      LABEL_COLUMNS,
      REGRESSORS,
      Regressor,
+    compose_sample_weights,
      ensure_datetime_series,
-    eval_set_and_weights,
+    make_test_set_and_weights,
      fit_regressor,
      format_dict,
      format_number,
      get_label_defaults,
      get_label_pipeline_config,
      get_label_prediction_config,
+    get_sample_weighting_config,
      get_min_max_label_period_candles,
      get_optuna_study_model_parameters,
+    label_weight_column_name,
      migrate_config,
      optuna_load_best_params,
      optuna_save_best_params,
+    sanitize_and_renormalize,
      soft_extremum,
      zigzag,
  )
@@ -76,6 +81,7 @@ ClusterMethod = Literal["kmeans", "kmeans2", "kmedoids"]
  DensityMethod = Literal["knn", "medoid"]
  SelectionMethod = Union[DistanceMethod, ClusterMethod, DensityMethod]
  ValidationMode = Literal["warn", "raise", "none"]
+SplitFn = Callable[[pd.DataFrame, pd.DataFrame, NDArray[np.floating]], dict[str, Any]]
  warnings.simplefilter(action="ignore", category=FutureWarning)
  
  logger = logging.getLogger(__name__)
@@ -98,10 +104,14 @@ class QuickAdapterRegressorV3(BaseRegressionModel):
      https://github.com/sponsors/robcaulk
      """
  
-    version = "3.11.8"
+    version = "3.11.9"
  
      _TEST_SIZE: Final[float] = 0.1
  
+    _SKLEARN_TRAIN_TEST_SPLIT_KEYS: Final[frozenset[str]] = frozenset(
+        {"test_size", "train_size", "random_state", "shuffle", "stratify"}
+    )
+
      _SQRT_2: Final[float] = np.sqrt(2.0)
  
      _OPTUNA_LABEL_N_OBJECTIVES: Final[int] = 7
@@ -318,9 +328,36 @@ class QuickAdapterRegressorV3(BaseRegressionModel):
          return set(QuickAdapterRegressorV3._POWER_MEAN_MAP.keys())
  
      @staticmethod
-    @lru_cache(maxsize=None)
-    def _data_split_methods_set() -> set[str]:
-        return set(QuickAdapterRegressorV3._DATA_SPLIT_METHODS)
+    def _shuffle_in_unison(
+        features: pd.DataFrame,
+        labels: pd.DataFrame,
+        weights: NDArray[np.floating],
+        seed: int,
+    ) -> tuple[pd.DataFrame, pd.DataFrame, NDArray[np.floating]]:
+        features = features.sample(frac=1, random_state=seed).reset_index(drop=True)
+        labels = labels.sample(frac=1, random_state=seed).reset_index(drop=True)
+        weights = (
+            pd.DataFrame(weights)
+            .sample(frac=1, random_state=seed)
+            .reset_index(drop=True)
+            .to_numpy()[:, 0]
+        )
+        return features, labels, weights
+
+    @staticmethod
+    def _coerce_int(value: Any, name: str, *, minimum: int) -> int:
+        if isinstance(value, bool) or not isinstance(value, int) or value < minimum:
+            raise ValueError(
+                f"Invalid data_split_parameters.{name} value {value!r}: "
+                f"must be int >= {minimum}"
+            )
+        return value
+
+    @staticmethod
+    def _coerce_optional_int(value: Any, name: str, *, minimum: int) -> Optional[int]:
+        if value is None:
+            return None
+        return QuickAdapterRegressorV3._coerce_int(value, name, minimum=minimum)
  
      @staticmethod
      def _get_selection_category(method: str) -> Optional[str]:
@@ -1334,85 +1371,269 @@ class QuickAdapterRegressorV3(BaseRegressionModel):
      def train(
          self, unfiltered_df: pd.DataFrame, pair: str, dk: FreqaiDataKitchen, **kwargs
      ) -> Any:
-        """
-        Filter the training data and train a model to it.
-
-        Supports two data split methods:
-        - 'train_test_split' (default): Delegates to BaseRegressionModel.train()
-        - 'timeseries_split': Chronological split with configurable gap. Uses the final
-          fold from sklearn's TimeSeriesSplit.
-
-        :param unfiltered_df: Full dataframe for the current training period
-        :param pair: Trading pair being trained
-        :param dk: FreqaiDataKitchen object containing configuration
-        :return: Trained model
+        """Train a model with per-row sample weights.
+
+        Dispatches on ``data_split_parameters.method``:
+        - ``train_test_split``: random sklearn split.
+        - ``timeseries_split``: chronological final-fold split.
+        Both paths compose per-row weights via ``_compose_per_row_weights``
+        before splitting and feed them to ``model.fit(sample_weight=...)``
+        through ``_train_common``. Train and test weights are renormalized
+        to mean=1 after ``feature_pipeline.fit_transform`` to preserve the
+        invariant despite pipeline-level row drops.
          """
          method = self.data_split_parameters.get(
              "method", QuickAdapterRegressorV3.DATA_SPLIT_METHOD_DEFAULT
          )
  
-        if method not in QuickAdapterRegressorV3._data_split_methods_set():
+        match method:
+            case "train_test_split":
+                split_builder = self._make_train_test_split_datasets
+            case "timeseries_split":
+                split_builder = self._make_timeseries_split_datasets
+            case _:
+                raise ValueError(
+                    f"Invalid data_split_parameters.method value {method!r}: "
+                    f"supported values are "
+                    f"{', '.join(QuickAdapterRegressorV3._DATA_SPLIT_METHODS)}"
+                )
+
+        def split_fn(
+            features: pd.DataFrame,
+            labels: pd.DataFrame,
+            weights: NDArray[np.floating],
+        ) -> dict[str, Any]:
+            return split_builder(features, labels, weights, dk)
+
+        weight_col_counts = Counter(
+            label_weight_column_name(label) for label in dk.label_list
+        )
+        duplicates = {col: n for col, n in weight_col_counts.items() if n > 1}
+        if duplicates:
              raise ValueError(
-                f"Invalid data_split_parameters.method value {method!r}: "
-                f"supported values are {', '.join(QuickAdapterRegressorV3._DATA_SPLIT_METHODS)}"
+                f"Duplicate weight column names {duplicates!r} from labels "
+                f"{dk.label_list}: each label must produce a unique weight_column_name"
              )
  
          logger.info(f"Using data split method: {method}")
+        return self._train_common(unfiltered_df, pair, dk, split_fn, **kwargs)
  
-        if method == QuickAdapterRegressorV3.DATA_SPLIT_METHOD_DEFAULT:
-            return super().train(unfiltered_df, pair, dk, **kwargs)
-
-        elif (
-            method == QuickAdapterRegressorV3._DATA_SPLIT_METHODS[1]
-        ):  # timeseries_split
-            logger.info(
-                f"-------------------- Starting training {pair} --------------------"
+    def _make_train_test_split_datasets(
+        self,
+        features: pd.DataFrame,
+        labels: pd.DataFrame,
+        weights: NDArray[np.floating],
+        dk: FreqaiDataKitchen,
+    ) -> dict[str, Any]:
+        """Train/test split via sklearn's ``train_test_split``.
+
+        Routes ``data_split_parameters`` to sklearn through a whitelist of
+        sklearn-recognized keys; project-custom keys (``method``,
+        ``n_splits``, ``gap``, ``max_train_size``) are filtered out.
+        ``shuffle`` and ``test_size`` default to ``False`` and ``_TEST_SIZE``
+        respectively when absent from ``data_split_parameters``. Honors
+        ``feature_parameters.shuffle_after_split`` (deterministic when
+        ``random_state`` is set) and ``feature_parameters.reverse_train_test_order``.
+        Per-row sample weights are sliced positionally and propagate to both
+        train and test sets.
+        """
+        feat_dict = self.freqai_info.get("feature_parameters", {})
+        dsp = dict(self.data_split_parameters)
+        dsp.setdefault("shuffle", False)
+        dsp.setdefault("test_size", QuickAdapterRegressorV3._TEST_SIZE)
+        sklearn_kwargs = {
+            k: v
+            for k, v in dsp.items()
+            if k in QuickAdapterRegressorV3._SKLEARN_TRAIN_TEST_SPLIT_KEYS
+        }
+        test_size = dsp["test_size"]
+        if isinstance(test_size, bool) or not isinstance(test_size, (int, float)):
+            raise ValueError(
+                f"Invalid data_split_parameters.test_size value {test_size!r}: "
+                f"must be int or float"
+            )
+
+        if test_size != 0:
+            (
+                train_features,
+                test_features,
+                train_labels,
+                test_labels,
+                train_weights,
+                test_weights,
+            ) = train_test_split(features, labels, weights, **sklearn_kwargs)
+        else:
+            train_features = features
+            train_labels = labels
+            train_weights = weights
+            test_features = features.iloc[:0]
+            test_labels = labels.iloc[:0]
+            test_weights = weights[:0]
+
+        if feat_dict.get("shuffle_after_split", False):
+            parent_seed = sklearn_kwargs.get("random_state")
+            shuffle_rng = (
+                random.Random(parent_seed)
+                if parent_seed is not None
+                else random.Random()
+            )
+            train_features, train_labels, train_weights = (
+                QuickAdapterRegressorV3._shuffle_in_unison(
+                    train_features,
+                    train_labels,
+                    train_weights,
+                    shuffle_rng.randint(0, 2**31 - 1),
+                )
              )
+            if test_size != 0:
+                test_features, test_labels, test_weights = (
+                    QuickAdapterRegressorV3._shuffle_in_unison(
+                        test_features,
+                        test_labels,
+                        test_weights,
+                        shuffle_rng.randint(0, 2**31 - 1),
+                    )
+                )
  
-            start_time = time.time()
+        train_weights = sanitize_and_renormalize(train_weights)
+        if test_size != 0:
+            test_weights = sanitize_and_renormalize(test_weights)
  
-            features_filtered, labels_filtered = dk.filter_features(
-                unfiltered_df,
-                dk.training_features_list,
-                dk.label_list,
-                training_filter=True,
+        if feat_dict.get("reverse_train_test_order", False):
+            return dk.build_data_dictionary(
+                test_features,
+                train_features,
+                test_labels,
+                train_labels,
+                test_weights,
+                train_weights,
              )
+        return dk.build_data_dictionary(
+            train_features,
+            test_features,
+            train_labels,
+            test_labels,
+            train_weights,
+            test_weights,
+        )
  
-            dates = ensure_datetime_series(unfiltered_df["date"])
-            start_date = dates.iloc[0].strftime("%Y-%m-%d")
-            end_date = dates.iloc[-1].strftime("%Y-%m-%d")
-            logger.info(
-                f"-------------------- Training on data from {start_date} to "
-                f"{end_date} --------------------"
+    def _compose_per_row_weights(
+        self,
+        features_filtered: pd.DataFrame,
+        unfiltered_df: pd.DataFrame,
+        dk: FreqaiDataKitchen,
+    ) -> NDArray[np.floating]:
+        """Build a per-row sample weight vector aligned to features_filtered.index.
+
+        Composes freqtrade's temporal recency weight with the configured
+        per-label aggregation (default ``arithmetic_mean``) of every
+        per-target weight column present on ``unfiltered_df``. Alignment
+        runs before any shuffle/split on ``features_filtered.index``
+        (a subset of ``unfiltered_df.index``) to avoid post-hoc reindex
+        against shuffled data. Iterates ``dk.label_list`` and only includes
+        labels whose ``label_weight_column_name(label)`` exists on
+        ``unfiltered_df``.
+        """
+        if not unfiltered_df.index.is_unique:
+            raise ValueError(
+                "unfiltered_df.index must be unique for label-based weight "
+                "alignment; received non-unique index"
              )
-
-            dd = self._make_timeseries_split_datasets(
-                features_filtered, labels_filtered, dk
+        if not features_filtered.index.isin(unfiltered_df.index).all():
+            raise ValueError(
+                "features_filtered.index must be a subset of "
+                "unfiltered_df.index (filter_features should preserve original "
+                "row labels)"
              )
-
-            if (
-                not self.freqai_info.get("fit_live_predictions_candles", 0)
-                or not self.live
-            ):
-                dk.fit_labels()
-
-            dd = self._apply_pipelines(dd, dk, pair)
-
-            logger.info(
-                f"Training model on {len(dd['train_features'].columns)} features"
+        n_rows = len(features_filtered)
+        feat_dict = self.freqai_info.get("feature_parameters", {})
+        weight_factor = feat_dict.get("weight_factor", 0)
+        if (
+            not isinstance(weight_factor, bool)
+            and isinstance(weight_factor, (int, float))
+            and weight_factor > 0
+        ):
+            temporal = np.asarray(dk.set_weights_higher_recent(n_rows), dtype=float)
+        else:
+            temporal = np.ones(n_rows, dtype=float)
+
+        per_label: dict[str, NDArray[np.floating]] = {}
+        missing: list[str] = []
+        for label in dk.label_list:
+            col = label_weight_column_name(label)
+            if col in unfiltered_df.columns:
+                per_label[label] = unfiltered_df.loc[
+                    features_filtered.index, col
+                ].to_numpy(dtype=float)
+            else:
+                missing.append(col)
+        if per_label:
+            logger.debug(
+                f"per-label weight columns active: {sorted(per_label)}"
+                + (f" (no weight column for: {sorted(missing)})" if missing else "")
              )
-            logger.info(f"Training model on {len(dd['train_features'])} data points")
-
-            model = self.fit(dd, dk, **kwargs)
-
-            end_time = time.time()
-
-            logger.info(
-                f"-------------------- Done training {pair} "
-                f"({end_time - start_time:.2f} secs) --------------------"
+        else:
+            logger.warning(
+                f"no per-label weight columns found (expected: {sorted(missing)}); "
+                f"falling back to temporal weights only"
              )
+        sample_weighting = get_sample_weighting_config(
+            self.freqai_info.get("sample_weighting", {}), logger
+        )
+        sample_weighting_default = sample_weighting["default"]
+        return compose_sample_weights(
+            temporal,
+            per_label,
+            logger=logger,
+            aggregation=sample_weighting_default["aggregation"],
+            softmax_temperature=sample_weighting_default["softmax_temperature"],
+        )
  
-            return model
+    def _train_common(
+        self,
+        unfiltered_df: pd.DataFrame,
+        pair: str,
+        dk: FreqaiDataKitchen,
+        split_fn: SplitFn,
+        **kwargs,
+    ) -> Any:
+        logger.info(
+            f"-------------------- Starting training {pair} --------------------"
+        )
+        start_time = time.time()
+        features_filtered, labels_filtered = dk.filter_features(
+            unfiltered_df,
+            dk.training_features_list,
+            dk.label_list,
+            training_filter=True,
+        )
+        weights = self._compose_per_row_weights(features_filtered, unfiltered_df, dk)
+        dates = ensure_datetime_series(unfiltered_df["date"])
+        start_date = dates.iloc[0].strftime("%Y-%m-%d")
+        end_date = dates.iloc[-1].strftime("%Y-%m-%d")
+        logger.info(
+            f"-------------------- Training on data from {start_date} to "
+            f"{end_date} --------------------"
+        )
+        dd = split_fn(features_filtered, labels_filtered, weights)
+        if not self.freqai_info.get("fit_live_predictions_candles", 0) or not self.live:
+            dk.fit_labels()
+        dd = self._apply_pipelines(dd, dk, pair)
+        if len(dd["train_features"]) != len(dd["train_weights"]):
+            raise RuntimeError(
+                f"Pipeline broke shape invariant: "
+                f"len(train_features)={len(dd['train_features'])} != "
+                f"len(train_weights)={len(dd['train_weights'])}"
+            )
+        logger.info(f"Training model on {len(dd['train_features'].columns)} features")
+        logger.info(f"Training model on {len(dd['train_features'])} data points")
+        model = self.fit(dd, dk, **kwargs)
+        end_time = time.time()
+        logger.info(
+            f"-------------------- Done training {pair} "
+            f"({end_time - start_time:.2f} secs) --------------------"
+        )
+        return model
  
      def _apply_pipelines(
          self,
@@ -1439,6 +1660,7 @@ class QuickAdapterRegressorV3(BaseRegressionModel):
                  dd["train_features"], dd["train_labels"], dd["train_weights"]
              )
          )
+        dd["train_weights"] = sanitize_and_renormalize(dd["train_weights"])
  
          dd["train_labels"], _, _ = dk.label_pipeline.fit_transform(dd["train_labels"])
  
@@ -1488,16 +1710,16 @@ class QuickAdapterRegressorV3(BaseRegressionModel):
                          dd["test_features"], dd["test_labels"], dd["test_weights"]
                      )
                  )
+                dd["test_weights"] = sanitize_and_renormalize(dd["test_weights"])
                  dd["test_labels"], _, _ = dk.label_pipeline.transform(dd["test_labels"])
  
-        dk.data_dictionary = dd
-
          return dd
  
      def _make_timeseries_split_datasets(
          self,
          filtered_dataframe: pd.DataFrame,
          labels: pd.DataFrame,
+        weights: NDArray[np.floating],
          dk: FreqaiDataKitchen,
      ) -> dict:
          """
@@ -1509,43 +1731,55 @@ class QuickAdapterRegressorV3(BaseRegressionModel):
  
          :param filtered_dataframe: Feature data to split
          :param labels: Label data to split
-        :param dk: FreqaiDataKitchen instance for weight calculation and data building
+        :param weights: Pre-computed per-row sample weights aligned to
+                        filtered_dataframe rows by position; sliced via
+                        ``weights[train_idx]`` / ``weights[test_idx]``.
+        :param dk: FreqaiDataKitchen instance for data building
          :return: data_dictionary with train/test features/labels/weights
          """
-        n_splits = int(
+        feat_dict = self.freqai_info.get("feature_parameters", {})
+        if feat_dict.get("shuffle_after_split", False):
+            raise ValueError(
+                "feature_parameters.shuffle_after_split=True is incompatible "
+                "with data_split_parameters.method='timeseries_split': "
+                "chronological split must preserve temporal ordering"
+            )
+        n_splits = QuickAdapterRegressorV3._coerce_int(
              self.data_split_parameters.get(
                  "n_splits", QuickAdapterRegressorV3.TIMESERIES_N_SPLITS_DEFAULT
-            )
+            ),
+            "n_splits",
+            minimum=2,
          )
-        gap = int(
+        gap = QuickAdapterRegressorV3._coerce_int(
              self.data_split_parameters.get(
                  "gap", QuickAdapterRegressorV3.TIMESERIES_GAP_DEFAULT
-            )
+            ),
+            "gap",
+            minimum=0,
          )
-        max_train_size = self.data_split_parameters.get(
-            "max_train_size", QuickAdapterRegressorV3.TIMESERIES_MAX_TRAIN_SIZE_DEFAULT
+        max_train_size = QuickAdapterRegressorV3._coerce_optional_int(
+            self.data_split_parameters.get(
+                "max_train_size",
+                QuickAdapterRegressorV3.TIMESERIES_MAX_TRAIN_SIZE_DEFAULT,
+            ),
+            "max_train_size",
+            minimum=1,
          )
-        max_train_size = int(max_train_size) if max_train_size is not None else None
-
-        if n_splits < 2:
-            raise ValueError(
-                f"Invalid data_split_parameters.n_splits value {n_splits!r}: must be >= 2"
-            )
-        if gap < 0:
-            raise ValueError(
-                f"Invalid data_split_parameters.gap value {gap!r}: must be >= 0"
-            )
-        if max_train_size is not None and max_train_size < 1:
-            raise ValueError(
-                f"Invalid data_split_parameters.max_train_size value {max_train_size!r}: "
-                f"must be >= 1 or None"
-            )
  
          test_size = self.data_split_parameters.get("test_size", None)
          if test_size is not None:
-            if isinstance(test_size, float) and 0 < test_size < 1:
+            if (
+                not isinstance(test_size, bool)
+                and isinstance(test_size, float)
+                and 0 < test_size < 1
+            ):
                  test_size = int(len(filtered_dataframe) * test_size)
-            elif isinstance(test_size, int) and test_size >= 1:
+            elif (
+                not isinstance(test_size, bool)
+                and isinstance(test_size, int)
+                and test_size >= 1
+            ):
                  pass
              else:
                  raise ValueError(
@@ -1573,25 +1807,31 @@ class QuickAdapterRegressorV3(BaseRegressionModel):
              max_train_size=max_train_size,
              test_size=test_size,
          )
-        train_idx: np.ndarray = np.array([])
-        test_idx: np.ndarray = np.array([])
-        for train_idx, test_idx in tscv.split(filtered_dataframe):
-            pass
+        folds = list(tscv.split(filtered_dataframe))
+        if not folds:
+            raise ValueError(
+                f"TimeSeriesSplit yielded no folds for {len(filtered_dataframe)} "
+                f"samples (n_splits={n_splits}, gap={gap}, "
+                f"max_train_size={max_train_size}, test_size={test_size})"
+            )
+        train_idx, test_idx = folds[-1]
  
          train_features = filtered_dataframe.iloc[train_idx]
          test_features = filtered_dataframe.iloc[test_idx]
          train_labels = labels.iloc[train_idx]
          test_labels = labels.iloc[test_idx]
-
-        feature_parameters = self.freqai_info.get("feature_parameters", {})
-        if feature_parameters.get("weight_factor", 0) > 0:
-            total_weights = dk.set_weights_higher_recent(len(train_idx) + len(test_idx))
-            train_weights = total_weights[: len(train_idx)]
-            test_weights = total_weights[len(train_idx) :]
-        else:
-            train_weights = np.ones(len(train_idx))
-            test_weights = np.ones(len(test_idx))
-
+        train_weights = sanitize_and_renormalize(weights[train_idx])
+        test_weights = sanitize_and_renormalize(weights[test_idx])
+
+        if feat_dict.get("reverse_train_test_order", False):
+            return dk.build_data_dictionary(
+                test_features,
+                train_features,
+                test_labels,
+                train_labels,
+                test_weights,
+                train_weights,
+            )
          return dk.build_data_dictionary(
              train_features,
              test_features,
@@ -1664,7 +1904,7 @@ class QuickAdapterRegressorV3(BaseRegressionModel):
                      **optuna_hp_params,
                  }
  
-        eval_set, eval_weights = eval_set_and_weights(
+        eval_set, eval_weights = make_test_set_and_weights(
              X_test,
              y_test,
              test_weights,
@@ -3512,7 +3752,7 @@ def hp_objective(
      )
      model_training_parameters = {**model_training_parameters, **study_model_parameters}
  
-    eval_set, eval_weights = eval_set_and_weights(
+    eval_set, eval_weights = make_test_set_and_weights(
          X_test, y_test, test_weights, test_size
      )
  
diff --git a/quickadapter/user_data/strategies/LabelTransformer.py b/quickadapter/user_data/strategies/LabelTransformer.py

index 11f93b8e9b6de4ff6a03a4824143f20121578be3..ca658d7cbb720e74b4703b7cbb8ff31c82e4d483 100644 (file)
--- a/quickadapter/user_data/strategies/LabelTransformer.py
+++ b/quickadapter/user_data/strategies/LabelTransformer.py
@@ -87,6 +87,11 @@ DEFAULTS_LABEL_WEIGHTING: Final[dict[str, Any]] = {
      "softmax_temperature": 1.0,
  }
  
+DEFAULTS_SAMPLE_WEIGHTING: Final[dict[str, Any]] = {
+    "aggregation": COMBINED_AGGREGATIONS[0],  # "arithmetic_mean"
+    "softmax_temperature": 1.0,
+}
+
  DEFAULTS_LABEL_PIPELINE: Final[dict[str, Any]] = {
      "standardization": STANDARDIZATION_TYPES[0],  # "none"
      "robust_quantiles": (0.25, 0.75),
diff --git a/quickadapter/user_data/strategies/QuickAdapterV3.py b/quickadapter/user_data/strategies/QuickAdapterV3.py

index f02eba277468075aa70477618f7bedf9b98d8744..e839555ad1c702696a1fe9426447b6b5002ad56a 100644 (file)
--- a/quickadapter/user_data/strategies/QuickAdapterV3.py
+++ b/quickadapter/user_data/strategies/QuickAdapterV3.py
@@ -29,15 +29,16 @@ from technical.pivots_points import pivots_points
  from Utils import (
      DEFAULT_FIT_LIVE_PREDICTIONS_CANDLES,
      EXTREMA_COLUMN,
+    EXTREMA_DIRECTION_COLUMN,
+    EXTREMA_DIRECTION_SMOOTHED_COLUMN,
+    EXTREMA_WEIGHT_COLUMN,
+    EXTREMA_WEIGHT_SMOOTHED_COLUMN,
      LABEL_COLUMNS,
-    MAXIMA_COLUMN,
-    MINIMA_COLUMN,
-    SMOOTHED_EXTREMA_COLUMN,
      TRADE_PRICE_TARGETS,
      alligator,
-    apply_label_weighting,
      bottom_log_return,
      calculate_quantile,
+    compute_label_weights,
      ensure_datetime_series,
      ewo,
      format_dict,
@@ -49,12 +50,13 @@ from Utils import (
      get_label_smoothing_config,
      get_label_weighting_config,
      get_zl_ma_fn,
+    label_weight_column_name,
      migrate_config,
      nan_average,
      non_zero_diff,
      optuna_load_best_params,
      price_retracement_percent,
-    smooth_label,
+    smooth,
      top_log_return,
      validate_range,
      vwapb,
@@ -106,10 +108,8 @@ class QuickAdapterV3(IStrategy):
  
      _ANNOTATION_LINE_OFFSET_CANDLES: Final[int] = 10
  
-    _PLOT_EXTREMA_MIN_EPS: Final[float] = 0.01
-
      def version(self) -> str:
-        return "3.11.8"
+        return "3.11.9"
  
      timeframe = "5m"
      timeframe_minutes = timeframe_to_minutes(timeframe)
@@ -205,10 +205,19 @@ class QuickAdapterV3(IStrategy):
                      },
                      EXTREMA_COLUMN: {"color": "orange", "type": "line"},
                  },
-                "min_max": {
-                    SMOOTHED_EXTREMA_COLUMN: {"color": "wheat", "type": "line"},
-                    MAXIMA_COLUMN: {"color": "red", "type": "bar"},
-                    MINIMA_COLUMN: {"color": "green", "type": "bar"},
+                "direction": {
+                    EXTREMA_DIRECTION_COLUMN: {"color": "wheat", "type": "line"},
+                    EXTREMA_DIRECTION_SMOOTHED_COLUMN: {
+                        "color": "orange",
+                        "type": "line",
+                    },
+                },
+                "weight": {
+                    EXTREMA_WEIGHT_COLUMN: {"color": "wheat", "type": "line"},
+                    EXTREMA_WEIGHT_SMOOTHED_COLUMN: {
+                        "color": "orange",
+                        "type": "line",
+                    },
                  },
              },
          }
@@ -826,57 +835,37 @@ class QuickAdapterV3(IStrategy):
                  label_col, label_weighting["default"], label_weighting["columns"]
              )
  
-            weighted_label, _ = apply_label_weighting(
-                label=label_data.series,
+            label_weights = compute_label_weights(
+                n_values=len(label_data.series),
                  indices=label_data.indices,
                  metrics=label_data.metrics,
                  weighting_config=col_weighting_config,
              )
  
-            dataframe[label_col] = weighted_label
+            label_weight_col = label_weight_column_name(label_col)
+
+            dataframe[label_col] = label_data.series
+            dataframe[label_weight_col] = label_weights
  
              if label_col == EXTREMA_COLUMN:
-                extrema = dataframe[label_col]
-                extrema_direction = label_data.series
-                plot_eps = extrema.abs().where(extrema.ne(0.0)).min()
-                if not np.isfinite(plot_eps):
-                    plot_eps = 0.0
-                plot_eps = max(
-                    float(plot_eps) * 0.5, QuickAdapterV3._PLOT_EXTREMA_MIN_EPS
-                )
-                dataframe[MAXIMA_COLUMN] = (
-                    extrema.where(extrema_direction.gt(0), 0.0)
-                    .clip(lower=0.0)
-                    .mask(
-                        extrema_direction.gt(0) & extrema.eq(0.0),
-                        plot_eps,
-                    )
-                )
-                dataframe[MINIMA_COLUMN] = (
-                    extrema.where(extrema_direction.lt(0), 0.0)
-                    .clip(upper=0.0)
-                    .mask(
-                        extrema_direction.lt(0) & extrema.eq(0.0),
-                        -plot_eps,
-                    )
-                )
+                dataframe[EXTREMA_DIRECTION_COLUMN] = dataframe[label_col]
+                dataframe[EXTREMA_WEIGHT_COLUMN] = dataframe[label_weight_col]
  
              col_smoothing_config = get_label_column_config(
                  label_col, label_smoothing["default"], label_smoothing["columns"]
              )
  
-            dataframe[label_col] = smooth_label(
-                dataframe[label_col],
-                col_smoothing_config["method"],
-                col_smoothing_config["window_candles"],
-                col_smoothing_config["beta"],
-                col_smoothing_config["polyorder"],
-                col_smoothing_config["mode"],
-                col_smoothing_config["sigma"],
+            dataframe[label_col] = smooth(dataframe[label_col], **col_smoothing_config)
+            smoothed_label_weights = smooth(
+                dataframe[label_weight_col], **col_smoothing_config
+            )
+            dataframe[label_weight_col] = smoothed_label_weights.where(
+                smoothed_label_weights.gt(0) & smoothed_label_weights.notna(), 0.0
              )
  
              if label_col == EXTREMA_COLUMN:
-                dataframe[SMOOTHED_EXTREMA_COLUMN] = dataframe[label_col]
+                dataframe[EXTREMA_DIRECTION_SMOOTHED_COLUMN] = dataframe[label_col]
+                dataframe[EXTREMA_WEIGHT_SMOOTHED_COLUMN] = dataframe[label_weight_col]
  
          return dataframe
  
diff --git a/quickadapter/user_data/strategies/Utils.py b/quickadapter/user_data/strategies/Utils.py

index a1def001e1f331e1d15219f061152338961d1cb1..da542dc25e94e84f848c093cfb10496921eb2a8f 100644 (file)
--- a/quickadapter/user_data/strategies/Utils.py
+++ b/quickadapter/user_data/strategies/Utils.py
@@ -3,6 +3,7 @@ import functools
  import hashlib
  import json
  import math
+import re
  from dataclasses import dataclass
  from enum import IntEnum
  from functools import lru_cache, singledispatch
@@ -31,6 +32,7 @@ from LabelTransformer import (
      DEFAULTS_LABEL_PREDICTION,
      DEFAULTS_LABEL_SMOOTHING,
      DEFAULTS_LABEL_WEIGHTING,
+    DEFAULTS_SAMPLE_WEIGHTING,
      EXTREMA_SELECTION_METHODS,
      NORMALIZATION_TYPES,
      PREDICTION_METHODS,
@@ -229,6 +231,13 @@ _SMOOTHING_SPECS: Final[dict[str, _ParamSpec]] = {
      ),
  }
  
+# Per-option validation specs for the "sample_weighting" config section;
+# consumed by _validate_sample_weighting_params.
+_SAMPLE_WEIGHTING_SPECS: Final[dict[str, _ParamSpec]] = {
+    # Must be one of the COMBINED_AGGREGATIONS values.
+    "aggregation": _ParamSpec(_EnumValidator(COMBINED_AGGREGATIONS)),
+    # Numeric with an exclusive lower bound of 0 (i.e. temperature > 0).
+    "softmax_temperature": _ParamSpec(
+        _NumericValidator(min_value=0, min_exclusive=True)
+    ),
+}
+
  _PREDICTION_SPECS: Final[dict[str, _ParamSpec]] = {
      "method": _ParamSpec(_EnumValidator(PREDICTION_METHODS)),
      "selection_method": _ParamSpec(_EnumValidator(EXTREMA_SELECTION_METHODS)),
@@ -250,8 +259,45 @@ _PREDICTION_SPECS: Final[dict[str, _ParamSpec]] = {
  
  
  EXTREMA_COLUMN: Final = "&s-extrema"
+# Plain dataframe column names (no "&" / "%" sigil) that receive copies of the
+# extrema label series and of its weight series; the *_SMOOTHED variants hold
+# the same series after smoothing.
+EXTREMA_DIRECTION_COLUMN: Final = "extrema_direction"
+EXTREMA_DIRECTION_SMOOTHED_COLUMN: Final = "extrema_direction_smoothed"
+EXTREMA_WEIGHT_COLUMN: Final = "extrema_weight"
+EXTREMA_WEIGHT_SMOOTHED_COLUMN: Final = "extrema_weight_smoothed"
+
+# Suffix appended to a sigil-stripped label name to build its weight column name.
+LABEL_WEIGHT_SUFFIX: Final[str] = "_weight"
+
  LABEL_COLUMNS: Final[tuple[str, ...]] = (EXTREMA_COLUMN,)
  
+# Matches a leading "&" plus an optional "-" immediately after it, anchored at
+# the start of the string only.
+_FREQAI_LABEL_SIGIL_PATTERN: Final = re.compile(r"^&-?")
+
+
+@lru_cache(maxsize=16)
+def label_weight_column_name(label_col: str) -> str:
+    """Derive the companion weight column name for ``label_col``.
+
+    The leading freqtrade label sigil (``&``, plus a ``-`` directly after it
+    when present) is removed and ``LABEL_WEIGHT_SUFFIX`` is appended. Dropping
+    the sigil keeps the weight column out of
+    ``FreqaiDataKitchen.find_labels`` (columns containing ``&``) and out of
+    ``find_features`` (columns containing ``%``). A leading ``s`` is kept: by
+    project convention it marks a smoothed target (e.g. ``&s-extrema``),
+    while its absence marks a raw target.
+
+    Examples:
+        ``"&s-extrema"``      -> ``"s-extrema_weight"`` (smoothed marker preserved)
+        ``"&-amplitude"``     -> ``"amplitude_weight"`` (raw target)
+        ``"&-time_to_pivot"`` -> ``"time_to_pivot_weight"`` (raw target)
+        ``"&-natr"``          -> ``"natr_weight"`` (raw target)
+
+    Raises:
+        ValueError: if the derived name still contains ``&`` or ``%``.
+    """
+    base_name = _FREQAI_LABEL_SIGIL_PATTERN.sub("", label_col, count=1)
+    weight_col = base_name + LABEL_WEIGHT_SUFFIX
+    # Guard clause: names free of both reserved markers are safe to return.
+    if not any(reserved in weight_col for reserved in ("&", "%")):
+        return weight_col
+    raise ValueError(
+        f"label_weight_column_name produced collision-prone name {weight_col!r} "
+        f"from {label_col!r}; weight columns must not contain '&' or '%'"
+    )
+
  
  @dataclass
  class LabelData:
@@ -324,10 +370,6 @@ def generate_label_data(
      return generator(dataframe, params)
  
  
-MAXIMA_COLUMN: Final = "maxima"
-MINIMA_COLUMN: Final = "minima"
-SMOOTHED_EXTREMA_COLUMN: Final = "smoothed_extrema"
-
  SmoothingKernel = Literal["gaussian", "kaiser", "triang"]
  SMOOTHING_KERNELS: Final[tuple[SmoothingKernel, ...]] = (
      "gaussian",
@@ -622,6 +664,29 @@ def get_label_smoothing_config(
      )
  
  
+def _validate_sample_weighting_params(
+    config: dict[str, Any],
+    logger: Logger,
+    config_name: str = "sample_weighting",
+) -> dict[str, Any]:
+    """Validate a ``sample_weighting`` config mapping.
+
+    Delegates to ``_validate_params`` with ``_SAMPLE_WEIGHTING_SPECS`` as the
+    per-option specs and ``DEFAULTS_SAMPLE_WEIGHTING`` as the defaults, and
+    returns whatever that helper returns.
+    NOTE(review): how invalid values are reported or replaced is decided
+    inside ``_validate_params`` - confirm there.
+    """
+    return _validate_params(
+        config, logger, config_name, _SAMPLE_WEIGHTING_SPECS, DEFAULTS_SAMPLE_WEIGHTING
+    )
+
+
+def get_sample_weighting_config(
+    config: dict[str, Any],
+    logger: Logger,
+) -> dict[str, Any]:
+    """Return the ``sample_weighting`` settings resolved from ``config``.
+
+    Delegates to ``_get_label_config`` with the ``"sample_weighting"`` section
+    name, ``_validate_sample_weighting_params`` as the validator and
+    ``DEFAULTS_SAMPLE_WEIGHTING`` as the defaults.
+    NOTE(review): where the section is looked up inside ``config`` is decided
+    by ``_get_label_config`` - confirm there.
+    """
+    return _get_label_config(
+        config,
+        logger,
+        "sample_weighting",
+        _validate_sample_weighting_params,
+        DEFAULTS_SAMPLE_WEIGHTING,
+    )
+
+
  def _validate_prediction_params(
      config: dict[str, Any],
      logger: Logger,
@@ -680,6 +745,96 @@ def midpoint(value1: T, value2: T) -> T:
      return (value1 + value2) / 2
  
  
+def sanitize_and_renormalize(
+    arr: NDArray[np.floating],
+    drop_mask: NDArray[np.bool_] | None = None,
+) -> NDArray[np.floating]:
+    """Zero out unusable weights and rescale the rest to mean 1.
+
+    Args:
+        arr: Raw weights. Non-finite and non-positive entries are set to 0.
+        drop_mask: Optional boolean mask; entries where it is True are forced
+            to 0 regardless of their value.
+
+    Returns:
+        A float array whose sum equals ``len(arr)`` (mean 1 over all
+        entries). When no usable mass remains, or the rescaling would produce
+        non-finite values, a uniform fallback is returned instead: 1.0
+        everywhere, 0.0 where ``drop_mask`` is True. An empty input is
+        returned as an empty float array.
+    """
+    arr = np.asarray(arr, dtype=float)
+    if arr.size == 0:
+        return arr
+    # np.where allocates a fresh array, so the masking below never touches
+    # the caller's data.
+    safe = np.where(np.isfinite(arr) & (arr > 0), arr, 0.0)
+    if drop_mask is not None:
+        safe[drop_mask] = 0.0
+    total = safe.sum()
+    if total > 0 and np.isfinite(total):
+        # A subnormal total makes len/total overflow to inf, and inf * 0.0 is
+        # NaN; only accept the rescaled weights when everything stayed finite.
+        with np.errstate(over="ignore", invalid="ignore"):
+            ratio = len(safe) / total
+            scaled = safe * ratio
+        if np.isfinite(ratio) and np.all(np.isfinite(scaled)):
+            return scaled
+    fallback = np.ones_like(arr)
+    if drop_mask is not None:
+        fallback[drop_mask] = 0.0
+    return fallback
+
+
+def compose_sample_weights(
+    base_weights: NDArray[np.floating],
+    label_weights_map: dict[str, NDArray[np.floating]],
+    *,
+    logger: Logger,
+    aggregation: CombinedAggregation = COMBINED_AGGREGATIONS[0],
+    softmax_temperature: float = 1.0,
+) -> NDArray[np.floating]:
+    """Combine base sample weights with per-label importance weights.
+
+    Returns w in R+^N with mean(w) == 1. Per-label arrays are sanitized
+    (non-finite or <= 0 -> row dropped), individually mean-normalized,
+    aggregated row-wise via ``aggregation`` (default arithmetic_mean),
+    multiplied with the sanitized base weights (non-finite or <= 0 -> 0),
+    zeroed on dropped rows, and renormalized to mean=1.
+
+    Args:
+        base_weights: 1-D per-sample weights of length N.
+        label_weights_map: Label name -> 1-D weight array of length N. When
+            empty, only ``base_weights`` is sanitized and renormalized.
+        logger: Receives a warning when the combined weights collapse.
+        aggregation: Row-wise aggregation passed to ``_aggregate_metrics``.
+        softmax_temperature: Temperature passed to ``_aggregate_metrics``.
+
+    Returns:
+        The combined weights. If they collapse (zero or non-finite sum, or a
+        non-finite rescale), falls back to the sanitized base weights with
+        dropped rows zeroed.
+
+    Raises ValueError on shape mismatch or when every row is dropped.
+    Default-weight imputation in compute_label_weights uses full-series
+    median (bounded leakage; see AFML chapter 4).
+    """
+    base_weights = np.asarray(base_weights, dtype=float)
+    if not label_weights_map:
+        return sanitize_and_renormalize(base_weights)
+    if base_weights.ndim != 1:
+        # A 2-D base would broadcast against the 1-D aggregate instead of failing.
+        raise ValueError(
+            f"compose_sample_weights: base_weights has shape {base_weights.shape}, "
+            f"expected a 1-D array"
+        )
+    n = len(base_weights)
+    # Convert once, and validate every label before doing any numeric work.
+    label_arrays = {
+        label: np.asarray(label_values, dtype=float)
+        for label, label_values in label_weights_map.items()
+    }
+    for label, arr in label_arrays.items():
+        if arr.shape != (n,):
+            raise ValueError(
+                f"compose_sample_weights: label {label!r} has shape {arr.shape}, "
+                f"expected ({n},)"
+            )
+    normalized_per_label: list[NDArray[np.floating]] = []
+    drop_mask = np.zeros(n, dtype=bool)
+    for arr in label_arrays.values():
+        invalid = ~np.isfinite(arr) | (arr <= 0.0)
+        drop_mask |= invalid
+        # Invalid rows get a neutral placeholder so they cannot distort the
+        # aggregation; they are zeroed through drop_mask afterwards.
+        arr = np.where(invalid, 1.0, np.maximum(arr, np.finfo(float).tiny))
+        normalized_per_label.append(sanitize_and_renormalize(arr))
+    if drop_mask.all():
+        raise ValueError(
+            f"compose_sample_weights: all rows dropped by per-label zero weights "
+            f"(labels={list(label_weights_map)}); no surviving training samples"
+        )
+    stacked = np.vstack(normalized_per_label)
+    agg = _aggregate_metrics(
+        stacked_metrics=stacked,
+        coefficients=np.ones(stacked.shape[0], dtype=float),
+        aggregation=aggregation,
+        softmax_temperature=softmax_temperature,
+    )
+    # Sanitize base weights the same way the no-label and fallback paths do,
+    # so a negative entry cannot leak into the result and a single NaN/inf
+    # cannot poison the sum and silently discard all label weighting.
+    safe_base = np.where(
+        np.isfinite(base_weights) & (base_weights > 0), base_weights, 0.0
+    )
+    combined = safe_base * agg
+    combined[drop_mask] = 0.0
+    combined_sum = combined.sum()
+    if combined_sum > 0 and np.isfinite(combined_sum):
+        ratio = n / combined_sum
+        if np.isfinite(ratio):
+            scaled = combined * ratio
+            if np.all(np.isfinite(scaled)):
+                return scaled
+    logger.warning(
+        "compose_sample_weights: aggregated weights collapsed (labels=%s, "
+        "aggregation=%s, combined_sum=%r); falling back to base weights",
+        list(label_weights_map),
+        aggregation,
+        combined_sum,
+    )
+    return sanitize_and_renormalize(base_weights, drop_mask=drop_mask)
+
+
  def nan_average(
      values: NDArray[np.floating],
      weights: NDArray[np.floating] | None = None,
@@ -771,7 +926,7 @@ def zero_phase_filter(
      return pd.Series(filtered_values, index=series.index)
  
  
-def smooth_label(
+def smooth(
      series: pd.Series,
      method: SmoothingMethod = DEFAULTS_LABEL_SMOOTHING["method"],
      window_candles: int = DEFAULTS_LABEL_SMOOTHING["window_candles"],
@@ -884,7 +1039,7 @@ def _impute_weights(
      return weights
  
  
-def _build_weights_array(
+def _scatter_weights(
      n_values: int,
      indices: list[int],
      weights: NDArray[np.floating],
@@ -961,6 +1116,8 @@ def _aggregate_metrics(
              ]
          )
      elif aggregation == COMBINED_AGGREGATIONS[5]:  # "softmax"
+        # Per-column softmax-weighted convex combination of stacked rows.
+        # T -> 0 collapses to argmax row; T -> +inf collapses to coefficient-weighted mean.
          scaled_metrics = stacked_metrics / softmax_temperature
          softmax_weights = sp.special.softmax(scaled_metrics, axis=0)
          combined_weights = softmax_weights * coefficients[:, np.newaxis]
@@ -1044,7 +1201,7 @@ def compute_label_weights(
          weights=weights,
      )
  
-    return _build_weights_array(
+    return _scatter_weights(
          n_values=n_values,
          indices=indices,
          weights=weights,
@@ -1052,46 +1209,6 @@ def compute_label_weights(
      )
  
  
-def _apply_label_weights(
-    values: NDArray[np.floating], weights: NDArray[np.floating]
-) -> NDArray[np.floating]:
-    if weights.size == 0:
-        return values
-
-    if not np.isfinite(weights).all():
-        return values
-
-    if np.allclose(weights, weights[0]):
-        return values
-
-    if np.allclose(weights, DEFAULT_LABEL_WEIGHT):
-        return values
-
-    return values * weights
-
-
-def apply_label_weighting(
-    label: pd.Series,
-    indices: list[int],
-    metrics: dict[str, list[float]],
-    weighting_config: dict[str, Any],
-) -> tuple[pd.Series, pd.Series]:
-    label_values = label.to_numpy(dtype=float)
-    label_index = label.index
-    n_values = label_values.size
-
-    weights = compute_label_weights(
-        n_values=n_values,
-        indices=indices,
-        metrics=metrics,
-        weighting_config=weighting_config,
-    )
-
-    return pd.Series(
-        _apply_label_weights(label_values, weights), index=label_index
-    ), pd.Series(weights, index=label_index)
-
-
  def get_callable_sha256(fn: Callable[..., Any]) -> str:
      if not callable(fn):
          raise ValueError(f"Invalid fn value {type(fn).__name__!r}: must be callable")
@@ -2534,7 +2651,7 @@ def fit_regressor(
      return model
  
  
-def eval_set_and_weights(
+def make_test_set_and_weights(
      X_test: pd.DataFrame,
      y_test: pd.DataFrame,
      test_weights: NDArray[np.floating],
author	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Sun, 24 May 2026 23:09:27 +0000 (01:09 +0200)
committer	GitHub <noreply@github.com>
	Sun, 24 May 2026 23:09:27 +0000 (01:09 +0200)
quickadapter/user_data/config-template.json		patch \| blob \| blame \| history
quickadapter/user_data/freqaimodels/QuickAdapterRegressorV3.py		patch \| blob \| blame \| history
quickadapter/user_data/strategies/LabelTransformer.py		patch \| blob \| blame \| history
quickadapter/user_data/strategies/QuickAdapterV3.py		patch \| blob \| blame \| history
quickadapter/user_data/strategies/Utils.py		patch \| blob \| blame \| history