From 5b51fbad058d831fd4876509bb2b4c9200d6566e Mon Sep 17 00:00:00 2001
From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?=
Date: Mon, 15 Dec 2025 22:08:46 +0100
Subject: [PATCH] refactor(ReforceXY): factor out reward factor computations
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jérôme Benoit
---
 ReforceXY/user_data/freqaimodels/ReforceXY.py | 124 +++++++++++++-----
 .../user_data/strategies/QuickAdapterV3.py    |   2 +-
 2 files changed, 94 insertions(+), 32 deletions(-)

diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index ec4095c..8a73ac9 100644
--- a/ReforceXY/user_data/freqaimodels/ReforceXY.py
+++ b/ReforceXY/user_data/freqaimodels/ReforceXY.py
@@ -148,7 +148,12 @@ class ReforceXY(BaseReinforcementLearningModel):
     """
 
     _LOG_2: Final[float] = math.log(2.0)
+
     DEFAULT_IDLE_DURATION_MULTIPLIER: Final[int] = 4
+    DEFAULT_BASE_FACTOR: Final[float] = 100.0
+    DEFAULT_HOLD_POTENTIAL_SCALE: Final[float] = 1.0
+    DEFAULT_EFFICIENCY_WEIGHT: Final[float] = 1.0
+    DEFAULT_MAX_TRADE_DURATION_CANDLES: Final[int] = 128
 
     _MODEL_TYPES: Final[Tuple[ModelType, ...]] = (
         "PPO",
@@ -1549,13 +1554,13 @@ class MyRLEnv(Base5ActionRLEnv):
         self._total_entry_additive: float = 0.0
         self._last_exit_additive: float = 0.0
         self._total_exit_additive: float = 0.0
-        model_reward_parameters: Dict[str, Any] = self.rl_config.get(
+        model_reward_parameters: Mapping[str, Any] = self.rl_config.get(
             "model_reward_parameters", {}
         )
         self.max_trade_duration_candles: int = int(
             model_reward_parameters.get(
                 "max_trade_duration_candles",
-                128,
+                ReforceXY.DEFAULT_MAX_TRADE_DURATION_CANDLES,
             )
         )
         self.max_idle_duration_candles: int = int(
@@ -1634,7 +1639,9 @@ class MyRLEnv(Base5ActionRLEnv):
             model_reward_parameters.get("hold_potential_enabled", True)
         )
         self._hold_potential_scale: float = float(
-            model_reward_parameters.get("hold_potential_scale", 1.0)
+            model_reward_parameters.get(
+                "hold_potential_scale", ReforceXY.DEFAULT_HOLD_POTENTIAL_SCALE
+            )
         )
         self._hold_potential_gain: float = float(
             model_reward_parameters.get("hold_potential_gain", 1.0)
@@ -2310,23 +2317,19 @@ class MyRLEnv(Base5ActionRLEnv):
             self._last_exit_reward = 0.0
         return observation, history
 
-    def _get_exit_factor(
+    def _compute_time_attenuation_factor(
         self,
         factor: float,
-        pnl: float,
         duration_ratio: float,
+        model_reward_parameters: Mapping[str, Any],
     ) -> float:
         """
-        Compute the reward factor at trade exit
+        Apply time-based decay to reward factor using configurable strategy
+        (legacy/sqrt/linear/power/half_life). Optionally apply plateau grace period.
""" - if not ( - np.isfinite(factor) and np.isfinite(pnl) and np.isfinite(duration_ratio) - ): - return 0.0 if duration_ratio < 0.0: duration_ratio = 0.0 - model_reward_parameters = self.rl_config.get("model_reward_parameters", {}) exit_attenuation_mode = str( model_reward_parameters.get( "exit_attenuation_mode", ReforceXY._EXIT_ATTENUATION_MODES[2] @@ -2338,23 +2341,20 @@ class MyRLEnv(Base5ActionRLEnv): ) if exit_plateau_grace < 0.0: exit_plateau_grace = 0.0 - exit_linear_slope = float(model_reward_parameters.get("exit_linear_slope", 1.0)) - if exit_linear_slope < 0.0: - exit_linear_slope = 0.0 - def _legacy(f: float, dr: float, p: Mapping) -> float: + def _legacy(f: float, dr: float, p: Mapping[str, Any]) -> float: return f * (1.5 if dr <= 1.0 else 0.5) - def _sqrt(f: float, dr: float, p: Mapping) -> float: + def _sqrt(f: float, dr: float, p: Mapping[str, Any]) -> float: return f / math.sqrt(1.0 + dr) - def _linear(f: float, dr: float, p: Mapping) -> float: + def _linear(f: float, dr: float, p: Mapping[str, Any]) -> float: slope = float(p.get("exit_linear_slope", 1.0)) if slope < 0.0: slope = 1.0 return f / (1.0 + slope * dr) - def _power(f: float, dr: float, p: Mapping) -> float: + def _power(f: float, dr: float, p: Mapping[str, Any]) -> float: tau = p.get("exit_power_tau") if isinstance(tau, (int, float)): tau = float(tau) @@ -2366,13 +2366,13 @@ class MyRLEnv(Base5ActionRLEnv): alpha = 1.0 return f / math.pow(1.0 + dr, alpha) - def _half_life(f: float, dr: float, p: Mapping) -> float: + def _half_life(f: float, dr: float, p: Mapping[str, Any]) -> float: hl = float(p.get("exit_half_life", 0.5)) if np.isclose(hl, 0.0) or hl < 0.0: return 1.0 return f * math.pow(2.0, -dr / hl) - strategies: Dict[str, Callable[[float, float, Mapping], float]] = { + strategies: Dict[str, Callable[[float, float, Mapping[str, Any]], float]] = { ReforceXY._EXIT_ATTENUATION_MODES[0]: _legacy, ReforceXY._EXIT_ATTENUATION_MODES[1]: _sqrt, ReforceXY._EXIT_ATTENUATION_MODES[2]: _linear, @@ -2410,7 +2410,31 @@ class MyRLEnv(Base5ActionRLEnv): ) factor = _linear(factor, effective_dr, model_reward_parameters) - factor *= self._get_pnl_factor(pnl, self._pnl_target) + return factor + + def _get_exit_factor( + self, + factor: float, + pnl: float, + duration_ratio: float, + model_reward_parameters: Mapping[str, Any], + ) -> float: + """ + Compute exit reward factor combining time attenuation and PnL factors + """ + if not ( + np.isfinite(factor) and np.isfinite(pnl) and np.isfinite(duration_ratio) + ): + return 0.0 + time_attenuation_factor = self._compute_time_attenuation_factor( + factor, + duration_ratio, + model_reward_parameters, + ) + + factor *= time_attenuation_factor * self._get_pnl_factor( + pnl, self._pnl_target, model_reward_parameters + ) check_invariants = model_reward_parameters.get("check_invariants", True) check_invariants = ( @@ -2441,16 +2465,18 @@ class MyRLEnv(Base5ActionRLEnv): return factor - def _get_pnl_factor(self, pnl: float, pnl_target: float) -> float: - if not np.isfinite(pnl): - return 0.0 - - model_reward_parameters = self.rl_config.get("model_reward_parameters", {}) - + def _compute_pnl_target_factor( + self, pnl: float, pnl_target: float, model_reward_parameters: Mapping[str, Any] + ) -> float: + """ + Scale reward based on PnL/target ratio using tanh (≥ 1.0 for good trades). 
+        """
         pnl_target_factor = 1.0
+
         if pnl_target > 0.0:
             pnl_factor_beta = float(model_reward_parameters.get("pnl_factor_beta", 0.5))
             pnl_ratio = pnl / pnl_target
+
             if abs(pnl_ratio) > 1.0:
                 base_pnl_target_factor = math.tanh(
                     pnl_factor_beta * (abs(pnl_ratio) - 1.0)
@@ -2458,6 +2484,7 @@ class MyRLEnv(Base5ActionRLEnv):
                 )
                 win_reward_factor = float(
                     model_reward_parameters.get("win_reward_factor", 2.0)
                 )
+
                 if pnl_ratio > 1.0:
                     pnl_target_factor = 1.0 + win_reward_factor * base_pnl_target_factor
                 elif pnl_ratio < -(1.0 / self.rr):
@@ -2466,9 +2493,22 @@ class MyRLEnv(Base5ActionRLEnv):
                         1.0 + loss_penalty_factor * base_pnl_target_factor
                     )
 
-        efficiency_factor = 1.0
-        efficiency_weight = float(model_reward_parameters.get("efficiency_weight", 1.0))
+        return pnl_target_factor
+
+    def _compute_efficiency_factor(
+        self, pnl: float, model_reward_parameters: Mapping[str, Any]
+    ) -> float:
+        """
+        Scale reward based on exit efficiency (distance from max unrealized PnL).
+        """
+        efficiency_weight = float(
+            model_reward_parameters.get(
+                "efficiency_weight", ReforceXY.DEFAULT_EFFICIENCY_WEIGHT
+            )
+        )
         efficiency_center = float(model_reward_parameters.get("efficiency_center", 0.5))
+
+        efficiency_factor = 1.0
         if efficiency_weight != 0.0 and not np.isclose(pnl, 0.0):
             max_pnl = max(self.get_max_unrealized_profit(), pnl)
             min_pnl = min(self.get_min_unrealized_profit(), pnl)
@@ -2484,6 +2524,24 @@ class MyRLEnv(Base5ActionRLEnv):
                     efficiency_center - efficiency_ratio
                 )
 
+        return efficiency_factor
+
+    def _get_pnl_factor(
+        self, pnl: float, pnl_target: float, model_reward_parameters: Mapping[str, Any]
+    ) -> float:
+        """
+        Combine PnL target and efficiency factors (>= 0.0)
+        """
+        if not np.isfinite(pnl):
+            return 0.0
+
+        pnl_target_factor = self._compute_pnl_target_factor(
+            pnl, pnl_target, model_reward_parameters
+        )
+        efficiency_factor = self._compute_efficiency_factor(
+            pnl, model_reward_parameters
+        )
+
         return max(0.0, pnl_target_factor * efficiency_factor)
 
     def calculate_reward(self, action: int) -> float:
@@ -2583,14 +2641,18 @@ class MyRLEnv(Base5ActionRLEnv):
             and action == Actions.Long_exit.value
             and self._position == Positions.Long
         ):
-            base_reward = pnl * self._get_exit_factor(base_factor, pnl, duration_ratio)
+            base_reward = pnl * self._get_exit_factor(
+                base_factor, pnl, duration_ratio, model_reward_parameters
+            )
             self._last_exit_reward = float(base_reward)
         if (
             base_reward is None
             and action == Actions.Short_exit.value
             and self._position == Positions.Short
         ):
-            base_reward = pnl * self._get_exit_factor(base_factor, pnl, duration_ratio)
+            base_reward = pnl * self._get_exit_factor(
+                base_factor, pnl, duration_ratio, model_reward_parameters
+            )
             self._last_exit_reward = float(base_reward)
 
         # 5. Default
diff --git a/quickadapter/user_data/strategies/QuickAdapterV3.py b/quickadapter/user_data/strategies/QuickAdapterV3.py
index ca1d158..1d5f4ad 100644
--- a/quickadapter/user_data/strategies/QuickAdapterV3.py
+++ b/quickadapter/user_data/strategies/QuickAdapterV3.py
@@ -1003,10 +1003,10 @@ class QuickAdapterV3(IStrategy):
                 f"{pair}: no extrema to label (label_period={QuickAdapterV3._td_format(label_period)} / {label_period_candles=} / {label_natr_ratio=:.2f})"
             )
         else:
-            dataframe.loc[pivots_indices, EXTREMA_COLUMN] = pivots_directions
             logger.info(
                 f"{pair}: labeled {len(pivots_indices)} extrema (label_period={QuickAdapterV3._td_format(label_period)} / {label_period_candles=} / {label_natr_ratio=:.2f})"
            )
+            dataframe.loc[pivots_indices, EXTREMA_COLUMN] = pivots_directions
 
         weighted_extrema, _ = get_weighted_extrema(
             extrema=dataframe[EXTREMA_COLUMN],
-- 
2.43.0
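
For reference, below is a minimal standalone sketch of the exit time-attenuation modes factored out above (legacy, sqrt, linear, half_life; the power mode's tau-to-alpha mapping lies outside the hunks shown and is omitted). It is illustrative only and not part of the patch: the plain-string mode keys, the base_factor value, and the params dict are assumptions, while the patched code resolves modes via ReforceXY._EXIT_ATTENUATION_MODES and reads model_reward_parameters from rl_config.

import math
from typing import Any, Callable, Dict, Mapping


def _legacy(f: float, dr: float, p: Mapping[str, Any]) -> float:
    # Boost before max duration, penalize after
    # (dr = trade duration / max_trade_duration_candles).
    return f * (1.5 if dr <= 1.0 else 0.5)


def _sqrt(f: float, dr: float, p: Mapping[str, Any]) -> float:
    # Smooth sub-linear decay with duration.
    return f / math.sqrt(1.0 + dr)


def _linear(f: float, dr: float, p: Mapping[str, Any]) -> float:
    # Linear decay controlled by exit_linear_slope (negative slopes fall back to 1.0).
    slope = float(p.get("exit_linear_slope", 1.0))
    if slope < 0.0:
        slope = 1.0
    return f / (1.0 + slope * dr)


def _half_life(f: float, dr: float, p: Mapping[str, Any]) -> float:
    # Exponential decay: the factor halves every exit_half_life units of dr.
    # The patch additionally treats values near zero via np.isclose.
    hl = float(p.get("exit_half_life", 0.5))
    if hl <= 0.0:
        return 1.0
    return f * math.pow(2.0, -dr / hl)


STRATEGIES: Dict[str, Callable[[float, float, Mapping[str, Any]], float]] = {
    "legacy": _legacy,
    "sqrt": _sqrt,
    "linear": _linear,
    "half_life": _half_life,
}

if __name__ == "__main__":
    params = {"exit_linear_slope": 1.0, "exit_half_life": 0.5}
    base_factor = 100.0  # assumed; mirrors DEFAULT_BASE_FACTOR in the patch
    for name, attenuate in STRATEGIES.items():
        row = [round(attenuate(base_factor, dr, params), 2) for dr in (0.0, 0.5, 1.0, 2.0)]
        print(name, row)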