Piment Noir Git Repositories - freqai-strategies.git/commitdiff
perf(reforcexy): fine-tune default reward settings
author Jérôme Benoit <jerome.benoit@piment-noir.org>
Mon, 6 Oct 2025 18:04:13 +0000 (20:04 +0200)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Mon, 6 Oct 2025 18:04:13 +0000 (20:04 +0200)
Signed-off-by: Jérôme Benoit <jerome.benoit@piment-noir.org>
ReforceXY/reward_space_analysis/README.md
ReforceXY/reward_space_analysis/reward_space_analysis.py
ReforceXY/reward_space_analysis/test_reward_space_analysis.py
ReforceXY/user_data/freqaimodels/ReforceXY.py

index 32bbd1334289a6a4bc37773e3418c72a3b48f2d1..ce5a3be7092c13bc20b19f24bd01529286ced6e8 100644 (file)
@@ -149,7 +149,7 @@ None - all parameters have sensible defaults.
 
 - Maximum trade duration in candles (from environment config)
 - Should match your actual trading environment setting
-- Also used as fallback for `max_idle_duration_candles` when that tunable is ≤ 0 (idle penalty grace behaviour)
+- Drives the idle grace window: when `max_idle_duration_candles` is ≤ 0 or unset, the fallback is `2 * max_trade_duration`
 
 ### Reward Configuration
 
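For reference, the grace-window resolution described in the README hunk above reduces to a few lines; a minimal standalone sketch (the resolve_max_idle_duration helper name is illustrative, not part of the codebase):

def resolve_max_idle_duration(max_idle_duration_candles, max_trade_duration_candles=128):
    # Unset, unparseable, or <= 0 values all fall back to 2x the trade duration (grace window).
    try:
        resolved = (
            int(max_idle_duration_candles)
            if max_idle_duration_candles is not None
            else 2 * int(max_trade_duration_candles)
        )
    except (TypeError, ValueError):
        resolved = 2 * int(max_trade_duration_candles)
    if resolved <= 0:
        resolved = 2 * int(max_trade_duration_candles)
    return resolved

assert resolve_max_idle_duration(None, 128) == 256  # unset -> 2x fallback
assert resolve_max_idle_duration(0, 100) == 200     # <= 0 -> 2x fallback
assert resolve_max_idle_duration(150, 100) == 150   # explicit positive value wins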
index 01a9834cf9665baddeb40820b24441ec221e3a45..e671f53ceb6b5462bf5d47a8bdc69e3f453ef2a2 100644 (file)
@@ -130,7 +130,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS: Dict[str, float | str] = {
     # Idle penalty (env defaults)
     "idle_penalty_power": 1.0,
     "idle_penalty_scale": 0.75,
-    # If <=0 or unset, falls back to max_trade_duration_candles at runtime
+    # Fallback semantics: if <=0 or unset → 2 * max_trade_duration_candles (grace window before full idle penalty)
     "max_idle_duration_candles": 0,
     # Holding keys (env defaults)
     "holding_penalty_scale": 0.5,
@@ -158,7 +158,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: Dict[str, str] = {
     "base_factor": "Base reward factor used inside the environment.",
     "idle_penalty_power": "Power applied to idle penalty scaling.",
     "idle_penalty_scale": "Scale of idle penalty.",
-    "max_idle_duration_candles": "Maximum idle duration candles before full idle penalty scaling; 0 = use max_trade_duration_candles.",
+    "max_idle_duration_candles": "Maximum idle duration candles before full idle penalty scaling; 0 = use 2 * max_trade_duration_candles.",
     "holding_penalty_scale": "Scale of holding penalty.",
     "holding_penalty_power": "Power applied to holding penalty scaling.",
     "exit_factor_mode": "Time attenuation mode for exit factor.",
@@ -489,18 +489,18 @@ def _idle_penalty(
     """Mirror the environment's idle penalty behaviour."""
     idle_penalty_scale = _get_param_float(params, "idle_penalty_scale", 0.75)
     idle_penalty_power = _get_param_float(params, "idle_penalty_power", 1.0)
-    max_trade_duration = int(params.get("max_trade_duration_candles", 128))
+    max_trade_duration_candles = int(params.get("max_trade_duration_candles", 128))
     max_idle_duration_candles = params.get("max_idle_duration_candles")
     try:
         max_idle_duration = (
             int(max_idle_duration_candles)
             if max_idle_duration_candles is not None
-            else max_trade_duration
+            else 2 * max_trade_duration_candles
         )
     except (TypeError, ValueError):
-        max_idle_duration = max_trade_duration
+        max_idle_duration = 2 * max_trade_duration_candles
     if max_idle_duration <= 0:
-        max_idle_duration = max_trade_duration
+        max_idle_duration = 2 * max_trade_duration_candles
     idle_duration_ratio = context.idle_duration / max(1, max_idle_duration)
     return -idle_factor * idle_penalty_scale * idle_duration_ratio**idle_penalty_power
 
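Plugging the defaults into the formula above gives concrete numbers for the mid-range case the tests exercise; a worked sketch with an illustrative idle_factor (in the script the real factor is derived from base_factor and the profit target):

idle_penalty_scale, idle_penalty_power = 0.75, 1.0  # defaults
max_trade_duration = 100
max_idle_duration = 2 * max_trade_duration          # fallback denominator: 200
idle_duration = 120
idle_factor = 1.0                                   # illustrative placeholder
idle_duration_ratio = idle_duration / max(1, max_idle_duration)  # 0.6
penalty = -idle_factor * idle_penalty_scale * idle_duration_ratio**idle_penalty_power
assert abs(penalty + 0.45) < 1e-9

Under the previous 1x fallback the same inputs would give a ratio of 1.2, i.e. the full idle penalty was reached twice as fast; the 2x grace window halves the slope.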
index 0fddd4c41fa8659a7bbb3fabbc405ffdda751c16..86a98d276e326013102c9eb282ad7337927cdefa 100644 (file)
@@ -461,7 +461,7 @@ class TestRewardAlignment(RewardSpaceTestBase):
         )
 
     def test_max_idle_duration_candles_logic(self):
-        """Idle penalty scaling test with explicit max_idle_duration_candles (no force-action comparisons)."""
+        """Idle penalty scaling test with explicit max_idle_duration_candles."""
         params_small = self.DEFAULT_PARAMS.copy()
         params_large = self.DEFAULT_PARAMS.copy()
         # Activate explicit max idle durations
@@ -511,9 +511,13 @@ class TestRewardAlignment(RewardSpaceTestBase):
         )
 
     def test_idle_penalty_fallback_and_proportionality(self):
-        """When max_idle_duration_candles <= 0, fallback to max_trade_duration and ensure proportional scaling.
+        """Fallback & proportionality validation.
 
-        Also validates that penalty doubles (approximately) when idle_duration doubles (holding other params constant).
+        Semantics:
+        - When max_idle_duration_candles <= 0, fallback must be 2 * max_trade_duration (updated rule).
+        - Idle penalty scales ~ linearly with idle_duration (power=1), so doubling idle_duration doubles penalty magnitude.
+        - We also infer the implicit denominator from a mid-range idle duration (>1x and <2x the trade duration) to
+          confirm the 2x fallback.
         """
         params = self.DEFAULT_PARAMS.copy()
         params["max_idle_duration_candles"] = 0  # force fallback
@@ -569,6 +573,32 @@ class TestRewardAlignment(RewardSpaceTestBase):
             tolerance=0.2,
             msg=f"Idle penalty proportionality mismatch (ratio={ratio})",
         )
+        # Additional mid-range inference check (idle_duration between 1x and 2x trade duration)
+        ctx_mid = dataclasses.replace(ctx_a, idle_duration=120, max_trade_duration=100)
+        br_mid = calculate_reward(
+            ctx_mid,
+            params,
+            base_factor=base_factor,
+            profit_target=profit_target,
+            risk_reward_ratio=risk_reward_ratio,
+            short_allowed=True,
+            action_masking=True,
+        )
+        self.assertLess(br_mid.idle_penalty, 0.0)
+        idle_penalty_scale = float(params.get("idle_penalty_scale", 0.75))
+        idle_penalty_power = float(params.get("idle_penalty_power", 1.0))
+        # Internal factor may come from params (overrides provided base_factor argument)
+        factor_used = float(params.get("base_factor", base_factor))
+        idle_factor = factor_used * (profit_target * risk_reward_ratio) / 3.0
+        observed_ratio = abs(br_mid.idle_penalty) / (idle_factor * idle_penalty_scale)
+        if observed_ratio > 0:
+            implied_D = 120 / (observed_ratio ** (1 / idle_penalty_power))
+            self.assertAlmostEqualFloat(
+                implied_D,
+                200.0,
+                tolerance=12.0,  # modest tolerance for float ops / rounding
+                msg=f"Fallback denominator mismatch (implied={implied_D}, expected≈200, factor_used={factor_used})",
+            )
 
     def test_exit_factor_threshold_warning_non_capping(self):
         """Ensure exit_factor_threshold does not cap the exit factor (warning-only semantics).
index e988339447bbdaf8e7289135af7ada398a7646a9..3f3e055b087292974e078d2b93646bec243b6984 100644 (file)
@@ -1392,9 +1392,7 @@ class MyRLEnv(Base5ActionRLEnv):
             duration_ratio = 0.0
 
         model_reward_parameters = self.rl_config.get("model_reward_parameters", {})
-        exit_factor_mode = str(
-            model_reward_parameters.get("exit_factor_mode", "piecewise")
-        ).lower()
+        exit_factor_mode = model_reward_parameters.get("exit_factor_mode", "piecewise")
 
         def _legacy(f: float, dr: float, p: Mapping) -> float:
             return f * (1.5 if dr <= 1.0 else 0.5)
@@ -1599,11 +1597,11 @@ class MyRLEnv(Base5ActionRLEnv):
         if action == Actions.Neutral.value and self._position == Positions.Neutral:
             max_idle_duration = int(
                 model_reward_parameters.get(
-                    "max_idle_duration_candles", max_trade_duration
+                    "max_idle_duration_candles", 2 * max_trade_duration
                 )
             )
             if max_idle_duration <= 0:
-                max_idle_duration = max_trade_duration
+                max_idle_duration = 2 * max_trade_duration
             idle_penalty_scale = float(
                 model_reward_parameters.get("idle_penalty_scale", 0.75)
             )
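With this change the environment and the analysis script resolve the grace window identically; a quick parity sketch of the environment logic above (the env_max_idle wrapper is illustrative, not part of MyRLEnv):

def env_max_idle(model_reward_parameters, max_trade_duration):
    # Mirrors MyRLEnv: a missing key and a value <= 0 both resolve to 2x the trade duration.
    max_idle_duration = int(
        model_reward_parameters.get("max_idle_duration_candles", 2 * max_trade_duration)
    )
    if max_idle_duration <= 0:
        max_idle_duration = 2 * max_trade_duration
    return max_idle_duration

assert env_max_idle({}, 128) == 256                                # default -> 2x
assert env_max_idle({"max_idle_duration_candles": 0}, 128) == 256  # <= 0 -> 2x
assert env_max_idle({"max_idle_duration_candles": 64}, 128) == 64  # explicit value wins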