Piment Noir Git Repositories - freqai-strategies.git/commitdiff
refactor(ReforceXY): sensible defaults for risk/reward ratio and hold potential
author Jérôme Benoit <jerome.benoit@piment-noir.org>
Thu, 25 Dec 2025 23:11:19 +0000 (00:11 +0100)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Thu, 25 Dec 2025 23:11:19 +0000 (00:11 +0100)
Signed-off-by: Jérôme Benoit <jerome.benoit@piment-noir.org>
ReforceXY/reward_space_analysis/README.md
ReforceXY/reward_space_analysis/reward_space_analysis.py
ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
ReforceXY/reward_space_analysis/tests/constants.py
ReforceXY/reward_space_analysis/tests/helpers/assertions.py
ReforceXY/reward_space_analysis/tests/helpers/test_internal_branches.py
ReforceXY/user_data/freqaimodels/ReforceXY.py

index 6e74cf0816567c2c790b55cc43a0c90eee1d2b67..b830dfca813b4b5c9dfcbd8f559c68e9d44b7f9b 100644 (file)
@@ -176,7 +176,7 @@ be overridden via `--params`.
 
 - **`--profit_aim`** (float, default: 0.03) – Profit target threshold (e.g.
   0.03=3%).
-- **`--risk_reward_ratio`** (float, default: 1.0) – Risk-reward multiplier.
+- **`--risk_reward_ratio`** (float, default: 2.0) – Risk-reward multiplier.
 - **`--action_masking`** (bool, default: true) – Simulate environment action
   masking. Invalid actions receive penalties only if masking disabled.
 
@@ -240,8 +240,8 @@ The exit factor is computed as:
 | Parameter           | Default | Description                   |
 | ------------------- | ------- | ----------------------------- |
 | `profit_aim`        | 0.03    | Profit target threshold       |
-| `risk_reward_ratio` | 1.0     | Risk/reward multiplier        |
-| `win_reward_factor` | 2.0     | Profit overshoot bonus factor |
+| `risk_reward_ratio` | 2.0     | Risk/reward multiplier        |
+| `win_reward_factor` | 2.0     | Profit target bonus factor    |
 | `pnl_factor_beta`   | 0.5     | PnL amplification sensitivity |
 
 **Note:** In ReforceXY, `risk_reward_ratio` maps to `rr`.
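For orientation (illustrative, not part of the patch): how the new `risk_reward_ratio` default combines with `profit_aim`, `win_reward_factor` and `pnl_factor_beta`. The overshoot formula below mirrors the expectation used in the test suite (`1 + win_reward_factor * tanh(pnl_factor_beta * (pnl_ratio - 1))`, applied once pnl exceeds the target, with `pnl_target = profit_aim * risk_reward_ratio`); the production code path has further branches not shown here.

```python
import math

# Assumed defaults taken from the tables above.
profit_aim = 0.03
risk_reward_ratio = 2.0      # new default (maps to `rr` in ReforceXY)
win_reward_factor = 2.0
pnl_factor_beta = 0.5

pnl_target = profit_aim * risk_reward_ratio   # 0.06

def overshoot_coefficient(pnl: float) -> float:
    """Amplification applied once |pnl / pnl_target| exceeds 1."""
    pnl_ratio = pnl / pnl_target
    return 1.0 + win_reward_factor * math.tanh(pnl_factor_beta * (pnl_ratio - 1.0))

print(round(overshoot_coefficient(0.12), 3))  # pnl at 2x target -> ~1.924
```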
@@ -332,12 +332,12 @@ across samples) and does not apply any drift correction in post-processing.
 
 #### Hold Potential Transforms
 
-| Parameter                           | Default | Description          |
-| ----------------------------------- | ------- | -------------------- |
-| `hold_potential_ratio`              | 0.25    | Hold potential ratio |
-| `hold_potential_gain`               | 1.0     | Gain multiplier      |
-| `hold_potential_transform_pnl`      | tanh    | PnL transform        |
-| `hold_potential_transform_duration` | tanh    | Duration transform   |
+| Parameter                           | Default  | Description          |
+| ----------------------------------- | -------- | -------------------- |
+| `hold_potential_ratio`              | 0.015625 | Hold potential ratio |
+| `hold_potential_gain`               | 1.0      | Gain multiplier      |
+| `hold_potential_transform_pnl`      | tanh     | PnL transform        |
+| `hold_potential_transform_duration` | tanh     | Duration transform   |
 
 **Hold Potential Formula:**
 
index 8b917017e59bdd57acf10e844f8948990b1c2457..55a1795ec0264bcdc15315dd5920b17d6c2a9197 100644 (file)
@@ -76,6 +76,9 @@ PBRS_INVARIANCE_TOL: float = 1e-6
 # Default discount factor γ for potential-based reward shaping
 POTENTIAL_GAMMA_DEFAULT: float = 0.95
 
+# Default risk/reward ratio (RR)
+RISK_REWARD_RATIO_DEFAULT: float = 2.0
+
 # Supported attenuation modes
 ATTENUATION_MODES: Tuple[str, ...] = ("sqrt", "linear", "power", "half_life")
 ATTENUATION_MODES_WITH_LEGACY: Tuple[str, ...] = ("legacy",) + ATTENUATION_MODES
@@ -150,7 +153,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = {
     "exit_potential_decay": 0.5,
     # Hold potential (PBRS function Φ)
     "hold_potential_enabled": True,
-    "hold_potential_ratio": 0.25,
+    "hold_potential_ratio": 0.015625,
     "hold_potential_gain": 1.0,
     "hold_potential_transform_pnl": "tanh",
     "hold_potential_transform_duration": "tanh",
@@ -580,14 +583,14 @@ def validate_reward_parameters(
     for bkey in _bool_keys:
         if bkey in sanitized:
             original_val = sanitized[bkey]
-            coerced = _to_bool(original_val)
-            if coerced is not original_val:
-                sanitized[bkey] = coerced
+            coerced_val = _to_bool(original_val)
+            if coerced_val is not original_val:
+                sanitized[bkey] = coerced_val
             adjustments.setdefault(
                 bkey,
                 {
                     "original": original_val,
-                    "adjusted": coerced,
+                    "adjusted": coerced_val,
                     "reason": "bool_coerce",
                     "validation_mode": "strict" if strict else "relaxed",
                 },
@@ -600,10 +603,10 @@ def validate_reward_parameters(
 
         original_val = sanitized[key]
         # Robust coercion to float using helper (handles None/str/bool/non-finite)
-        coerced = _get_float_param({key: original_val}, key, np.nan)
+        coerced_val = _get_float_param({key: original_val}, key, np.nan)
 
         # Handle non-numeric or unparsable values
-        if not np.isfinite(coerced):
+        if not np.isfinite(coerced_val):
             # Treat derived parameters specially: drop to allow downstream derivation
             if key == "max_idle_duration_candles":
                 # Remove the key so downstream helpers derive from max_trade_duration_candles
@@ -627,7 +630,7 @@ def validate_reward_parameters(
             }
             continue
 
-        original_numeric = float(coerced)
+        original_numeric = float(coerced_val)
 
         # Track type coercion
         if not isinstance(original_val, (int, float)):
@@ -982,7 +985,7 @@ def _compute_pnl_target_coefficient(
     if pnl_target > 0.0:
         win_reward_factor = _get_float_param(params, "win_reward_factor")
         pnl_factor_beta = _get_float_param(params, "pnl_factor_beta")
-        rr = risk_reward_ratio if risk_reward_ratio > 0 else 1.0
+        rr = risk_reward_ratio if risk_reward_ratio > 0 else RISK_REWARD_RATIO_DEFAULT
 
         pnl_ratio = pnl / pnl_target
         if abs(pnl_ratio) > 1.0:
@@ -3347,11 +3350,11 @@ def _compute_pnl_duration_signal(
     pnl_ratio = float(pnl / pnl_target)
     duration_ratio = float(np.clip(duration_ratio, 0.0, 1.0))
 
-    ratio = _get_float_param(params, scale_key, 0.25 if "hold" in scale_key else 0.125)
+    ratio = _get_float_param(params, scale_key)
     scale = ratio * base_factor
-    gain = _get_float_param(params, gain_key, 1.0)
-    transform_pnl = _get_str_param(params, transform_pnl_key, "tanh")
-    transform_duration = _get_str_param(params, transform_dur_key, "tanh")
+    gain = _get_float_param(params, gain_key)
+    transform_pnl = _get_str_param(params, transform_pnl_key)
+    transform_duration = _get_str_param(params, transform_dur_key)
 
     duration_multiplier = 1.0
     if risk_reward_ratio is not None:
@@ -3426,8 +3429,8 @@ def build_argument_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--risk_reward_ratio",
         type=float,
-        default=1.0,
-        help="Risk reward ratio multiplier (default: 1.0).",
+        default=RISK_REWARD_RATIO_DEFAULT,
+        help=f"Risk reward ratio multiplier (default: {RISK_REWARD_RATIO_DEFAULT}).",
     )
     parser.add_argument(
         "--max_duration_ratio",
index 962df5d8a43c0fcf299e79ef1ccba2edbff16ef8..caca85b76e221e4a6e4454d8f9c04d91fee1504c 100644 (file)
@@ -176,7 +176,7 @@ class TestRewardComponents(RewardSpaceTestBase):
         config = RewardScenarioConfig(
             base_factor=PARAMS.BASE_FACTOR,
             profit_aim=PARAMS.PROFIT_AIM,
-            risk_reward_ratio=1.0,
+            risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             tolerance_relaxed=TOLERANCE.IDENTITY_RELAXED,
         )
         assert_reward_calculation_scenarios(
@@ -569,6 +569,8 @@ class TestRewardComponents(RewardSpaceTestBase):
         win_reward_factor = 3.0
         beta = 0.5
         profit_aim = PARAMS.PROFIT_AIM
+        risk_reward_ratio = PARAMS.RISK_REWARD_RATIO
+        pnl_target = profit_aim * risk_reward_ratio
         params = self.base_params(
             win_reward_factor=win_reward_factor,
             pnl_factor_beta=beta,
@@ -578,7 +580,7 @@ class TestRewardComponents(RewardSpaceTestBase):
             exit_linear_slope=0.0,
         )
         params.pop("base_factor", None)
-        pnl_values = [profit_aim * m for m in (1.05, PARAMS.RISK_REWARD_RATIO_HIGH, 5.0, 10.0)]
+        pnl_values = [pnl_target * m for m in (1.05, 2.0, 5.0, 10.0)]
         ratios_observed: list[float] = []
         for pnl in pnl_values:
             context = self.make_ctx(
@@ -591,7 +593,11 @@ class TestRewardComponents(RewardSpaceTestBase):
                 action=Actions.Long_exit,
             )
             br = calculate_reward_with_defaults(
-                context, params, base_factor=1.0, profit_aim=profit_aim
+                context,
+                params,
+                base_factor=1.0,
+                profit_aim=profit_aim,
+                risk_reward_ratio=risk_reward_ratio,
             )
             ratio = br.exit_component / pnl if pnl != 0 else 0.0
             ratios_observed.append(float(ratio))
@@ -611,7 +617,7 @@ class TestRewardComponents(RewardSpaceTestBase):
         )
         expected_ratios: list[float] = []
         for pnl in pnl_values:
-            pnl_ratio = pnl / profit_aim
+            pnl_ratio = pnl / pnl_target
             expected = 1.0 + win_reward_factor * math.tanh(beta * (pnl_ratio - 1.0))
             expected_ratios.append(expected)
         for obs, exp in zip(ratios_observed, expected_ratios):
@@ -633,7 +639,7 @@ class TestRewardComponents(RewardSpaceTestBase):
         """
         base_factor = PARAMS.BASE_FACTOR
         profit_aim = PARAMS.PROFIT_AIM
-        risk_reward_ratio = 1.0
+        risk_reward_ratio = PARAMS.RISK_REWARD_RATIO
         max_trade_duration_candles = PARAMS.TRADE_DURATION_MEDIUM
 
         params = self.base_params(
@@ -770,8 +776,12 @@ class TestRewardComponents(RewardSpaceTestBase):
         )
         params_rr.pop("risk_reward_ratio", None)
 
-        br_ratio = calculate_reward_with_defaults(context, params_ratio, risk_reward_ratio=1.0)
-        br_rr = calculate_reward_with_defaults(context, params_rr, risk_reward_ratio=1.0)
+        br_ratio = calculate_reward_with_defaults(
+            context, params_ratio, risk_reward_ratio=PARAMS.RISK_REWARD_RATIO
+        )
+        br_rr = calculate_reward_with_defaults(
+            context, params_rr, risk_reward_ratio=PARAMS.RISK_REWARD_RATIO
+        )
 
         self.assertAlmostEqualFloat(
             br_rr.total,
index 750dc93f37238f8e8b08dc1a6eac02f868f3e6b7..731f9979f2729faf9e50cc6c3fdac3b816a3a01b 100644 (file)
@@ -226,8 +226,8 @@ class TestParameters:
     Attributes:
         BASE_FACTOR: Default base factor for reward scaling (90.0)
         PROFIT_AIM: Target profit threshold (0.06)
-        RISK_REWARD_RATIO: Standard risk/reward ratio (1.0)
-        RISK_REWARD_RATIO_HIGH: High risk/reward ratio for stress tests (2.0)
+        RISK_REWARD_RATIO: Standard risk/reward ratio (2.0)
+        RISK_REWARD_RATIO_HIGH: High risk/reward ratio for stress tests (4.0)
         PNL_STD: Standard deviation for PnL generation (0.02)
         PNL_DUR_VOL_SCALE: Duration-based volatility scaling factor (0.001)
 
@@ -247,14 +247,17 @@ class TestParameters:
         MAX_TRADE_DURATION_HETEROSCEDASTICITY: Max trade duration used for heteroscedasticity tests (10)
 
         # Common additive parameters
-        ADDITIVE_RATIO_DEFAULT: Default additive ratio (0.4)
+        ADDITIVE_RATIO_DEFAULT: Default additive ratio (0.125)
         ADDITIVE_GAIN_DEFAULT: Default additive gain (1.0)
+
+        # PBRS hold potential parameters
+        HOLD_POTENTIAL_RATIO_DEFAULT: Default hold potential ratio (0.015625)
     """
 
     BASE_FACTOR: float = 90.0
     PROFIT_AIM: float = 0.06
-    RISK_REWARD_RATIO: float = 1.0
-    RISK_REWARD_RATIO_HIGH: float = 2.0
+    RISK_REWARD_RATIO: float = 2.0
+    RISK_REWARD_RATIO_HIGH: float = 4.0
     PNL_STD: float = 0.02
     PNL_DUR_VOL_SCALE: float = 0.001
 
@@ -274,9 +277,12 @@ class TestParameters:
     MAX_TRADE_DURATION_HETEROSCEDASTICITY: int = 10
 
     # Additive parameters
-    ADDITIVE_RATIO_DEFAULT: float = 0.4
+    ADDITIVE_RATIO_DEFAULT: float = 0.125
     ADDITIVE_GAIN_DEFAULT: float = 1.0
 
+    # PBRS hold potential parameters
+    HOLD_POTENTIAL_RATIO_DEFAULT: float = 0.015625
+
 
 @dataclass(frozen=True)
 class TestScenarios:
index 88f9fe20f49cd34f348c4c34a422c244d7afb953..530af449895722d47d1822470e408741b0c6e436 100644 (file)
@@ -530,7 +530,7 @@ def assert_exit_factor_attenuation_modes(
     attenuation_modes: Sequence[str],
     base_params_fn,
     tolerance_relaxed: float,
-    risk_reward_ratio: float = 1.0,
+    risk_reward_ratio: float = PARAMS.RISK_REWARD_RATIO,
 ):
     """Validate exit factor attenuation across multiple modes.
 
@@ -1067,7 +1067,7 @@ def assert_exit_factor_invariant_suite(
                 duration_ratio=case["duration_ratio"],
                 context=case["context"],
                 params=case["params"],
-                risk_reward_ratio=2.0,
+                risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             )
             exp = case.get("expectation")
             if exp == "safe_zero":
@@ -1124,7 +1124,7 @@ def assert_exit_factor_kernel_fallback(
             self, _get_exit_factor, 90.0, 0.08, 0.03, 0.5, test_context,
             bad_params={"exit_attenuation_mode": "power", "exit_power_tau": -1.0},
             reference_params={"exit_attenuation_mode": "linear"},
-            risk_reward_ratio=1.0
+            risk_reward_ratio=PARAMS.RISK_REWARD_RATIO
         )
     """
 
index 15e1211f99e98dc4c9716aadad0ac828cd45324f..61419c8dfe4f2f7388b1ca8badfbfee4ef3a2a64 100644 (file)
@@ -9,6 +9,7 @@ from reward_space_analysis import (
     _get_bool_param,
 )
 
+from ..constants import PARAMS
 from ..test_base import make_ctx
 from . import calculate_reward_with_defaults
 
@@ -74,7 +75,7 @@ def test_calculate_reward_unrealized_pnl_hold_path():
         params,
         base_factor=100.0,
         profit_aim=0.05,
-        risk_reward_ratio=1.0,
+        risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
         prev_potential=np.nan,
     )
     assert math.isfinite(breakdown.prev_potential)
index 0ea5f2391dc60f9716f279a77c82d5bdb9932582..7c4a8f163534ef0a658dfd6403826ea8bd42f179 100644 (file)
@@ -150,18 +150,17 @@ class ReforceXY(BaseReinforcementLearningModel):
 
     _LOG_2: Final[float] = math.log(2.0)
 
+    DEFAULT_BASE_FACTOR: Final[float] = 100.0
+
     DEFAULT_MAX_TRADE_DURATION_CANDLES: Final[int] = 128
     DEFAULT_IDLE_DURATION_MULTIPLIER: Final[int] = 4
 
-    DEFAULT_BASE_FACTOR: Final[float] = 100.0
-    DEFAULT_EFFICIENCY_WEIGHT: Final[float] = 1.0
-
     DEFAULT_EXIT_POTENTIAL_DECAY: Final[float] = 0.5
     DEFAULT_ENTRY_ADDITIVE_ENABLED: Final[bool] = False
     DEFAULT_ENTRY_ADDITIVE_RATIO: Final[float] = 0.125
     DEFAULT_ENTRY_ADDITIVE_GAIN: Final[float] = 1.0
     DEFAULT_HOLD_POTENTIAL_ENABLED: Final[bool] = True
-    DEFAULT_HOLD_POTENTIAL_RATIO: Final[float] = 0.25
+    DEFAULT_HOLD_POTENTIAL_RATIO: Final[float] = 0.015625
     DEFAULT_HOLD_POTENTIAL_GAIN: Final[float] = 1.0
     DEFAULT_EXIT_ADDITIVE_ENABLED: Final[bool] = False
     DEFAULT_EXIT_ADDITIVE_RATIO: Final[float] = 0.125
@@ -174,6 +173,7 @@ class ReforceXY(BaseReinforcementLearningModel):
 
     DEFAULT_PNL_FACTOR_BETA: Final[float] = 0.5
     DEFAULT_WIN_REWARD_FACTOR: Final[float] = 2.0
+    DEFAULT_EFFICIENCY_WEIGHT: Final[float] = 1.0
     DEFAULT_EFFICIENCY_CENTER: Final[float] = 0.5
 
     DEFAULT_INVALID_ACTION: Final[float] = -2.0
@@ -262,6 +262,7 @@ class ReforceXY(BaseReinforcementLearningModel):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+
         self.pairs: List[str] = self.config.get("exchange", {}).get("pair_whitelist")
         if not self.pairs:
             raise ValueError(
@@ -2432,7 +2433,7 @@ class MyRLEnv(Base5ActionRLEnv):
                 reward_shaping = gamma * next_potential - prev_potential
 
             if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
-                duration_ratio = trade_duration / max(max_trade_duration, 1)
+                duration_ratio = trade_duration / max(1, max_trade_duration)
                 exit_additive = self._compute_exit_additive(
                     pnl, pnl_target, duration_ratio, exit_additive_scale
                 )
@@ -4072,7 +4073,7 @@ def deepmerge(dst: Dict[str, Any], src: Dict[str, Any]) -> Dict[str, Any]:
 
 def _compute_gradient_steps(tf: int, ss: int) -> int:
     if tf > 0 and ss > 0:
-        return min(tf, max(math.ceil(tf / ss), 1))
+        return min(tf, max(1, math.ceil(tf / ss)))
     return -1
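For orientation (illustrative, not part of the patch, which only reorders the `max()` arguments): `_compute_gradient_steps` yields at least one gradient step, capped at `tf`, with `-1` as the sentinel for non-positive inputs.

```python
import math

def _compute_gradient_steps(tf: int, ss: int) -> int:
    # Mirrors the helper above.
    if tf > 0 and ss > 0:
        return min(tf, max(1, math.ceil(tf / ss)))
    return -1

assert _compute_gradient_steps(64, 6) == 11   # ceil(64 / 6) = 11, below the cap of 64
assert _compute_gradient_steps(4, 8) == 1     # ceil(4 / 8) = 1 -> floored at one step
assert _compute_gradient_steps(0, 8) == -1    # non-positive input -> sentinel
```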