Piment Noir Git Repositories - freqai-strategies.git/commitdiff
fix(ReforceXY): enforce coherent scale for reward components
author Jérôme Benoit <jerome.benoit@piment-noir.org>
Wed, 24 Dec 2025 18:27:31 +0000 (19:27 +0100)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Wed, 24 Dec 2025 18:27:31 +0000 (19:27 +0100)
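Parameterize the PBRS hold potential and the entry/exit additives as ratios of
base_factor instead of free-standing scales, so every shaping term lives on the
same magnitude axis as the exit reward (scale = ratio * base_factor). Align the
idle/hold penalty scales at 1.0 and drop the historical /4.0 divisor from
idle_factor. Accept "rr" as an alias for "risk_reward_ratio", and make invalid
exit_power_tau / exit_half_life values fall back with a diagnostic warning now
that the strict-validation flag is removed.

A minimal sketch of the resulting scaling contract (illustration only, not
repository code; the profit_aim value below is hypothetical, the other
defaults come from DEFAULT_MODEL_REWARD_PARAMETERS):

    # Illustration: effective magnitudes are ratios of base_factor.
    base_factor = 100.0                 # default base reward magnitude
    hold_potential_ratio = 0.25         # replaces hold_potential_scale
    entry_additive_ratio = 0.125        # replaces entry_additive_scale
    exit_additive_ratio = 0.125         # replaces exit_additive_scale

    hold_potential_scale = hold_potential_ratio * base_factor   # 25.0
    entry_additive_scale = entry_additive_ratio * base_factor   # 12.5
    exit_additive_scale = exit_additive_ratio * base_factor     # 12.5

    # Idle/hold penalty factor, without the former /4.0 divisor:
    profit_aim, risk_reward_ratio = 0.03, 1.0   # hypothetical values
    idle_factor = base_factor * (profit_aim / risk_reward_ratio)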
Signed-off-by: Jérôme Benoit <jerome.benoit@piment-noir.org>
.gitignore
ReforceXY/reward_space_analysis/README.md
ReforceXY/reward_space_analysis/reward_space_analysis.py
ReforceXY/reward_space_analysis/tests/components/test_additives.py
ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
ReforceXY/reward_space_analysis/tests/constants.py
ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
ReforceXY/reward_space_analysis/tests/test_base.py
ReforceXY/user_data/freqaimodels/ReforceXY.py

index 2e089ccf0ddbf5347e9afc42754e0c223d78f12e..8860197ed8fa4952fa0ed51ed21e9488febfeaf5 100644 (file)
@@ -379,7 +379,7 @@ config.json
 **/user_data/data/**
 !.gitkeep
 
-*/.serena
+*/.serena/
 */.serena/**
 */.clinerules
 */.clinerules/**
index 34bdc5a48747a9ebbdb89fbd76805e4ad32bd70e..cb8be7c7da9f5e5280ff841802a62fd04a0ddf36 100644 (file)
@@ -298,9 +298,9 @@ where `kernel_function` depends on `exit_attenuation_mode`. See [Exit Attenuatio
 | ---------------------------- | ------- | -------------------------- |
 | `max_trade_duration_candles` | 128     | Trade duration cap         |
 | `max_idle_duration_candles`  | None    | Fallback 4× trade duration |
-| `idle_penalty_scale`         | 0.5     | Idle penalty scale         |
+| `idle_penalty_scale`         | 1.0     | Idle penalty scale         |
 | `idle_penalty_power`         | 1.025   | Idle penalty exponent      |
-| `hold_penalty_scale`         | 0.25    | Hold penalty scale         |
+| `hold_penalty_scale`         | 1.0     | Hold penalty scale         |
 | `hold_penalty_power`         | 1.025   | Hold penalty exponent      |
 
 #### Validation
@@ -334,7 +334,7 @@ across samples) and does not apply any drift correction in post-processing.
 
 | Parameter                           | Default | Description          |
 | ----------------------------------- | ------- | -------------------- |
-| `hold_potential_scale`              | 1.0     | Hold potential scale |
+| `hold_potential_ratio`              | 0.25    | Hold potential ratio |
 | `hold_potential_gain`               | 1.0     | Gain multiplier      |
 | `hold_potential_transform_pnl`      | tanh    | PnL transform        |
 | `hold_potential_transform_duration` | tanh    | Duration transform   |
@@ -366,7 +366,7 @@ losses compared to symmetric treatment.
 | Parameter                           | Default | Description           |
 | ----------------------------------- | ------- | --------------------- |
 | `entry_additive_enabled`            | false   | Enable entry additive |
-| `entry_additive_scale`              | 1.0     | Scale                 |
+| `entry_additive_ratio`              | 0.125   | Ratio                 |
 | `entry_additive_gain`               | 1.0     | Gain                  |
 | `entry_additive_transform_pnl`      | tanh    | PnL transform         |
 | `entry_additive_transform_duration` | tanh    | Duration transform    |
@@ -376,7 +376,7 @@ losses compared to symmetric treatment.
 | Parameter                          | Default | Description          |
 | ---------------------------------- | ------- | -------------------- |
 | `exit_additive_enabled`            | false   | Enable exit additive |
-| `exit_additive_scale`              | 1.0     | Scale                |
+| `exit_additive_ratio`              | 0.125   | Ratio                |
 | `exit_additive_gain`               | 1.0     | Gain                 |
 | `exit_additive_transform_pnl`      | tanh    | PnL transform        |
 | `exit_additive_transform_duration` | tanh    | Duration transform   |
index 8ff185fea8176eaa8ef682c41482ed4f974d5e7d..9a78b1f80b7b3ec87d91b98876ab120bb32694aa 100644 (file)
@@ -117,13 +117,13 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = {
     "invalid_action": -2.0,
     "base_factor": 100.0,
     # Idle penalty defaults
-    "idle_penalty_scale": 0.5,
+    "idle_penalty_scale": 1.0,
     "idle_penalty_power": 1.025,
     "max_trade_duration_candles": 128,
     # Fallback: DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration_candles
     "max_idle_duration_candles": None,
     # Hold penalty defaults
-    "hold_penalty_scale": 0.25,
+    "hold_penalty_scale": 1.0,
     "hold_penalty_power": 1.025,
     # Exit attenuation defaults
     "exit_attenuation_mode": "linear",
@@ -150,13 +150,13 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = {
     "exit_potential_decay": 0.5,
     # Hold potential (PBRS function Φ)
     "hold_potential_enabled": True,
-    "hold_potential_scale": 1.0,
+    "hold_potential_ratio": 0.25,
     "hold_potential_gain": 1.0,
     "hold_potential_transform_pnl": "tanh",
     "hold_potential_transform_duration": "tanh",
     # Entry additive (non-PBRS additive term)
     "entry_additive_enabled": False,
-    "entry_additive_scale": 1.0,
+    "entry_additive_ratio": 0.125,
     "entry_additive_gain": 1.0,
     "entry_additive_transform_pnl": "tanh",
     "entry_additive_transform_duration": "tanh",
@@ -164,7 +164,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = {
     "exit_fee_rate": 0.0,
     # Exit additive (non-PBRS additive term)
     "exit_additive_enabled": False,
-    "exit_additive_scale": 1.0,
+    "exit_additive_ratio": 0.125,
     "exit_additive_gain": 1.0,
     "exit_additive_transform_pnl": "tanh",
     "exit_additive_transform_duration": "tanh",
@@ -196,19 +196,19 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: Dict[str, str] = {
     "exit_potential_mode": "Exit potential mode (canonical|non_canonical|progressive_release|spike_cancel|retain_previous)",
     "exit_potential_decay": "Decay for progressive_release (0–1)",
     "hold_potential_enabled": "Enable hold potential Φ",
-    "hold_potential_scale": "Hold potential scale",
+    "hold_potential_ratio": "Hold potential ratio",
     "hold_potential_gain": "Hold potential gain",
     "hold_potential_transform_pnl": "Hold PnL transform",
     "hold_potential_transform_duration": "Hold duration transform",
     "entry_additive_enabled": "Enable entry additive",
-    "entry_additive_scale": "Entry additive scale",
+    "entry_additive_ratio": "Entry additive ratio",
     "entry_additive_gain": "Entry additive gain",
     "entry_additive_transform_pnl": "Entry PnL transform",
     "entry_additive_transform_duration": "Entry duration transform",
     "entry_fee_rate": "Entry fee rate",
     "exit_fee_rate": "Exit fee rate",
     "exit_additive_enabled": "Enable exit additive",
-    "exit_additive_scale": "Exit additive scale",
+    "exit_additive_ratio": "Exit additive ratio",
     "exit_additive_gain": "Exit additive gain",
     "exit_additive_transform_pnl": "Exit PnL transform",
     "exit_additive_transform_duration": "Exit duration transform",
@@ -240,13 +240,13 @@ _PARAMETER_BOUNDS: Dict[str, Dict[str, float]] = {
     # PBRS parameter bounds
     "potential_gamma": {"min": 0.0, "max": 1.0},
     "exit_potential_decay": {"min": 0.0, "max": 1.0},
-    "hold_potential_scale": {"min": 0.0},
+    "hold_potential_ratio": {"min": 0.0},
     "hold_potential_gain": {"min": 0.0},
-    "entry_additive_scale": {"min": 0.0},
+    "entry_additive_ratio": {"min": 0.0},
     "entry_additive_gain": {"min": 0.0},
     "entry_fee_rate": {"min": 0.0, "max": 0.1},
     "exit_fee_rate": {"min": 0.0, "max": 0.1},
-    "exit_additive_scale": {"min": 0.0},
+    "exit_additive_ratio": {"min": 0.0},
     "exit_additive_gain": {"min": 0.0},
 }
 
@@ -340,11 +340,6 @@ def _resolve_additive_enablement(
     return entry_additive_effective, exit_additive_effective, additives_suppressed
 
 
-def _is_strict_validation(params: RewardParams) -> bool:
-    """Return strict validation flag from params (default True)."""
-    return _get_bool_param(params, "strict_validation", True)
-
-
 def _get_float_param(params: RewardParams, key: str, default: RewardParamValue) -> float:
     """Extract float parameter with type safety and default fallback."""
     value = params.get(key, default)
@@ -486,7 +481,7 @@ def _is_short_allowed(trading_mode: str) -> bool:
 
 
 def _fail_safely(reason: str) -> float:
-    """Return 0.0 on recoverable numeric failure."""
+    """Return 0.0 on numeric failure."""
     _ = reason
     return 0.0
 
@@ -794,22 +789,20 @@ def _compute_time_attenuation_coefficient(
         return 1.0 / (1.0 + exit_linear_slope * dr)
 
     def _power_kernel(dr: float) -> float:
-        tau = _get_float_param(
-            params,
-            "exit_power_tau",
-            DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_power_tau", 0.5),
-        )
-        if 0.0 < tau <= 1.0:
-            alpha = -math.log(tau) / _LOG_2
-        else:
-            if _is_strict_validation(params):
-                raise ValueError(f"exit_power_tau={tau} must be in (0,1] in strict mode")
-            warnings.warn(
-                f"exit_power_tau={tau} invalid; falling back to alpha=1.0",
-                RewardDiagnosticsWarning,
-                stacklevel=2,
-            )
+        raw_tau = params.get("exit_power_tau", None)
+        if raw_tau is None:
             alpha = 1.0
+        else:
+            tau = _get_float_param(params, "exit_power_tau", np.nan)
+            if 0.0 < tau <= 1.0:
+                alpha = -math.log(tau) / _LOG_2
+            else:
+                warnings.warn(
+                    f"exit_power_tau={tau} invalid; falling back to alpha=1.0",
+                    RewardDiagnosticsWarning,
+                    stacklevel=2,
+                )
+                alpha = 1.0
         return 1.0 / math.pow(1.0 + dr, alpha)
 
     def _half_life_kernel(dr: float) -> float:
@@ -818,8 +811,6 @@ def _compute_time_attenuation_coefficient(
             "exit_half_life",
             DEFAULT_MODEL_REWARD_PARAMETERS.get("exit_half_life", 0.5),
         )
-        if hl <= 0.0 and _is_strict_validation(params):
-            raise ValueError(f"exit_half_life={hl} must be > 0 in strict mode")
         if np.isclose(hl, 0.0):
             warnings.warn(
                 f"exit_half_life={hl} close to 0; falling back to 1.0",
@@ -827,6 +818,13 @@ def _compute_time_attenuation_coefficient(
                 stacklevel=2,
             )
             return 1.0
+        if hl < 0.0:
+            warnings.warn(
+                f"exit_half_life={hl} negative; falling back to 1.0",
+                RewardDiagnosticsWarning,
+                stacklevel=2,
+            )
+            return 1.0
         return math.pow(2.0, -dr / hl)
 
     kernels = {
@@ -1100,7 +1098,7 @@ def _idle_penalty(context: RewardContext, idle_factor: float, params: RewardPara
     idle_penalty_scale = _get_float_param(
         params,
         "idle_penalty_scale",
-        DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_scale", 0.5),
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("idle_penalty_scale", 1.0),
     )
     idle_penalty_power = _get_float_param(
         params,
@@ -1117,7 +1115,7 @@ def _hold_penalty(context: RewardContext, hold_factor: float, params: RewardPara
     hold_penalty_scale = _get_float_param(
         params,
         "hold_penalty_scale",
-        DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_scale", 0.25),
+        DEFAULT_MODEL_REWARD_PARAMETERS.get("hold_penalty_scale", 1.0),
     )
     hold_penalty_power = _get_float_param(
         params,
@@ -1199,10 +1197,12 @@ def calculate_reward(
 
     if "risk_reward_ratio" in params:
         risk_reward_ratio = _get_float_param(params, "risk_reward_ratio", float(risk_reward_ratio))
+    elif "rr" in params:
+        risk_reward_ratio = _get_float_param(params, "rr", float(risk_reward_ratio))
 
     pnl_target = float(profit_aim * risk_reward_ratio)
 
-    idle_factor = base_factor * (profit_aim / risk_reward_ratio) / 4.0
+    idle_factor = base_factor * (profit_aim / risk_reward_ratio)
     hold_factor = idle_factor
 
     max_trade_duration_candles = _get_int_param(
@@ -1366,6 +1366,7 @@ def calculate_reward(
                 prev_potential=prev_potential,
                 params=params,
                 risk_reward_ratio=risk_reward_ratio,
+                base_factor=base_factor,
             )
         )
 
@@ -3133,6 +3134,7 @@ def _compute_hold_potential(
     duration_ratio: float,
     risk_reward_ratio: float,
     params: RewardParams,
+    base_factor: float,
 ) -> float:
     """Compute PBRS hold potential Φ(s)."""
     if not _get_bool_param(
@@ -3148,12 +3150,13 @@ def _compute_hold_potential(
         pnl_target=pnl_target,
         duration_ratio=duration_ratio,
         params=params,
-        scale_key="hold_potential_scale",
+        scale_key="hold_potential_ratio",
         gain_key="hold_potential_gain",
         transform_pnl_key="hold_potential_transform_pnl",
         transform_dur_key="hold_potential_transform_duration",
         non_finite_key="non_finite_hold_potential",
         risk_reward_ratio=risk_reward_ratio,
+        base_factor=base_factor,
     )
 
 
@@ -3162,6 +3165,7 @@ def _compute_entry_additive(
     pnl_target: float,
     duration_ratio: float,
     params: RewardParams,
+    base_factor: float,
 ) -> float:
     if not _get_bool_param(
         params,
@@ -3175,11 +3179,12 @@ def _compute_entry_additive(
         pnl_target=pnl_target,
         duration_ratio=duration_ratio,
         params=params,
-        scale_key="entry_additive_scale",
+        scale_key="entry_additive_ratio",
         gain_key="entry_additive_gain",
         transform_pnl_key="entry_additive_transform_pnl",
         transform_dur_key="entry_additive_transform_duration",
         non_finite_key="non_finite_entry_additive",
+        base_factor=base_factor,
     )
 
 
@@ -3188,6 +3193,7 @@ def _compute_exit_additive(
     pnl_target: float,
     duration_ratio: float,
     params: RewardParams,
+    base_factor: float,
 ) -> float:
     if not _get_bool_param(
         params,
@@ -3201,11 +3207,12 @@ def _compute_exit_additive(
         pnl_target=pnl_target,
         duration_ratio=duration_ratio,
         params=params,
-        scale_key="exit_additive_scale",
+        scale_key="exit_additive_ratio",
         gain_key="exit_additive_gain",
         transform_pnl_key="exit_additive_transform_pnl",
         transform_dur_key="exit_additive_transform_duration",
         non_finite_key="non_finite_exit_additive",
+        base_factor=base_factor,
     )
 
 
@@ -3271,10 +3278,11 @@ def compute_pbrs_components(
     next_duration_ratio: float,
     params: RewardParams,
     *,
+    base_factor: float,
     risk_reward_ratio: float,
+    prev_potential: float,
     is_exit: bool = False,
     is_entry: bool = False,
-    prev_potential: float,
 ) -> tuple[float, float, float, float, float]:
     """Compute potential-based reward shaping (PBRS) components.
 
@@ -3333,6 +3341,7 @@ def compute_pbrs_components(
                 next_duration_ratio,
                 risk_reward_ratio,
                 params,
+                base_factor,
             )
             pbrs_delta = gamma * next_potential - prev_potential
             reward_shaping = pbrs_delta
@@ -3341,9 +3350,11 @@ def compute_pbrs_components(
         entry_additive = 0.0
         exit_additive = 0.0
     else:
-        cand_entry_add = _compute_entry_additive(next_pnl, pnl_target, next_duration_ratio, params)
+        cand_entry_add = _compute_entry_additive(
+            next_pnl, pnl_target, next_duration_ratio, params, base_factor
+        )
         cand_exit_add = _compute_exit_additive(
-            current_pnl, pnl_target, current_duration_ratio, params
+            current_pnl, pnl_target, current_duration_ratio, params, base_factor
         )
         entry_additive = cand_entry_add if is_entry else 0.0
         exit_additive = cand_exit_add if is_exit else 0.0
@@ -3375,10 +3386,11 @@ def apply_potential_shaping(
     next_duration_ratio: float,
     params: RewardParams,
     *,
+    base_factor: float,
     risk_reward_ratio: float,
+    prev_potential: float,
     is_exit: bool = False,
     is_entry: bool = False,
-    prev_potential: float,
 ) -> tuple[float, float, float, float, float, float]:
     """Compute shaped reward and PBRS diagnostics.
 
@@ -3403,10 +3415,11 @@ def apply_potential_shaping(
             next_pnl,
             next_duration_ratio,
             params,
+            base_factor=base_factor,
             risk_reward_ratio=risk_reward_ratio,
+            prev_potential=prev_potential,
             is_exit=is_exit,
             is_entry=is_entry,
-            prev_potential=prev_potential,
         )
     )
 
@@ -3436,6 +3449,7 @@ def _compute_bi_component(
     transform_dur_key: str,
     non_finite_key: str,
     *,
+    base_factor: float,
     risk_reward_ratio: Optional[float] = None,
 ) -> float:
     """Generic helper for (pnl, duration) bi-component transforms."""
@@ -3447,7 +3461,8 @@ def _compute_bi_component(
     pnl_ratio = float(pnl / pnl_target)
     duration_ratio = float(np.clip(duration_ratio, 0.0, 1.0))
 
-    scale = _get_float_param(params, scale_key, 1.0)
+    ratio = _get_float_param(params, scale_key, 0.25 if "hold" in scale_key else 0.125)
+    scale = ratio * base_factor
     gain = _get_float_param(params, gain_key, 1.0)
     transform_pnl = _get_str_param(params, transform_pnl_key, "tanh")
     transform_duration = _get_str_param(params, transform_dur_key, "tanh")
@@ -4422,7 +4437,11 @@ def main() -> None:
 
     base_factor = _get_float_param(params, "base_factor", float(args.base_factor))
     profit_aim = _get_float_param(params, "profit_aim", float(args.profit_aim))
-    risk_reward_ratio = _get_float_param(params, "risk_reward_ratio", float(args.risk_reward_ratio))
+    risk_reward_ratio = _get_float_param(
+        params,
+        "risk_reward_ratio",
+        _get_float_param(params, "rr", float(args.risk_reward_ratio)),
+    )
 
     cli_action_masking = _to_bool(args.action_masking)
     if "action_masking" in params:
index 9df0dcdcda178be4cb957be5aab6698686546917..9d4508d216da7995c1bf2ad694c21f819858089d 100644 (file)
@@ -32,7 +32,7 @@ class TestAdditivesDeterministicContribution(RewardSpaceTestBase):
         **Setup:**
         - Base configuration: hold_potential enabled, additives disabled
         - Test configuration: entry_additive and exit_additive enabled
-        - Additive parameters: scale=0.4, gain=1.0 for both entry/exit
+        - Additive parameters: ratio=0.4, gain=1.0 for both entry/exit
         - Context: base_reward=0.05, pnl=0.01, duration_ratio=0.2
 
         **Assertions:**
@@ -55,8 +55,8 @@ class TestAdditivesDeterministicContribution(RewardSpaceTestBase):
             {
                 "entry_additive_enabled": True,
                 "exit_additive_enabled": True,
-                "entry_additive_scale": 0.4,
-                "exit_additive_scale": 0.4,
+                "entry_additive_ratio": 0.4,
+                "exit_additive_ratio": 0.4,
                 "entry_additive_gain": 1.0,
                 "exit_additive_gain": 1.0,
             }
@@ -73,11 +73,17 @@ class TestAdditivesDeterministicContribution(RewardSpaceTestBase):
             "is_exit": False,
         }
         s0, _n0, _pbrs0, _entry0, _exit0 = compute_pbrs_components(
-            prev_potential=0.0, params=base, **ctx
+            params=base,
+            base_factor=PARAMS.BASE_FACTOR,
+            prev_potential=0.0,
+            **ctx,
         )
         t0 = base_reward + s0 + _entry0 + _exit0
         s1, _n1, _pbrs1, _entry1, _exit1 = compute_pbrs_components(
-            prev_potential=0.0, params=with_add, **ctx
+            params=with_add,
+            base_factor=PARAMS.BASE_FACTOR,
+            prev_potential=0.0,
+            **ctx,
         )
         t1 = base_reward + s1 + _entry1 + _exit1
         self.assertFinite(t1)
index 4f398b97ea560ae6d7942c5949c874407fee7d42..0a3673e2e5644b2217e5cc7a40db78372327311b 100644 (file)
@@ -7,6 +7,7 @@ import unittest
 import pytest
 
 from reward_space_analysis import (
+    DEFAULT_IDLE_DURATION_MULTIPLIER,
     Actions,
     Positions,
     _compute_efficiency_coefficient,
@@ -15,6 +16,7 @@ from reward_space_analysis import (
     _get_exit_factor,
     _get_float_param,
     calculate_reward,
+    get_max_idle_duration_candles,
 )
 
 from ..constants import PARAMS, SCENARIOS, TOLERANCE
@@ -39,7 +41,7 @@ class TestRewardComponents(RewardSpaceTestBase):
         """Test hold potential computation returns finite values."""
         params = {
             "hold_potential_enabled": True,
-            "hold_potential_scale": 1.0,
+            "hold_potential_ratio": 1.0,
             "hold_potential_gain": 1.0,
             "hold_potential_transform_pnl": "tanh",
             "hold_potential_transform_duration": "tanh",
@@ -50,6 +52,7 @@ class TestRewardComponents(RewardSpaceTestBase):
             0.3,
             PARAMS.RISK_REWARD_RATIO,
             params,
+            PARAMS.BASE_FACTOR,
         )
         self.assertFinite(val, name="hold_potential")
 
@@ -672,10 +675,23 @@ class TestRewardComponents(RewardSpaceTestBase):
         - penalty(duration=40) ≈ 2 × penalty(duration=20)
         - Proportional scaling with idle duration
         """
-        params = self.base_params(max_idle_duration_candles=None, max_trade_duration_candles=100)
         base_factor = PARAMS.BASE_FACTOR
         profit_aim = PARAMS.PROFIT_AIM
         risk_reward_ratio = 1.0
+        max_trade_duration_candles = 100
+        params = self.base_params(
+            max_idle_duration_candles=None,
+            max_trade_duration_candles=max_trade_duration_candles,
+            base_factor=base_factor,
+        )
+        expected_max_idle_duration_candles = int(
+            DEFAULT_IDLE_DURATION_MULTIPLIER * max_trade_duration_candles
+        )
+        self.assertEqual(
+            get_max_idle_duration_candles(params),
+            expected_max_idle_duration_candles,
+            "Expected fallback max_idle_duration from max_trade_duration",
+        )
 
         base_context_kwargs = {
             "pnl": 0.0,
@@ -711,15 +727,18 @@ class TestRewardComponents(RewardSpaceTestBase):
         if ratio is not None:
             self.assertAlmostEqualFloat(abs(ratio), 2.0, tolerance=0.2)
 
-        idle_penalty_scale = _get_float_param(params, "idle_penalty_scale", 0.5)
+        idle_penalty_scale = _get_float_param(params, "idle_penalty_scale", 1.0)
         idle_penalty_power = _get_float_param(params, "idle_penalty_power", 1.025)
-        base_factor = _get_float_param(params, "base_factor", float(base_factor))
-        risk_reward_ratio = _get_float_param(params, "risk_reward_ratio", float(risk_reward_ratio))
-        idle_factor = base_factor * (profit_aim / risk_reward_ratio) / 4.0
+        idle_factor = base_factor * (profit_aim / risk_reward_ratio)
         observed_ratio = abs(br_mid.idle_penalty) / (idle_factor * idle_penalty_scale)
         if observed_ratio > 0:
             implied_max_idle_duration_candles = 120 / observed_ratio ** (1 / idle_penalty_power)
-            self.assertAlmostEqualFloat(implied_max_idle_duration_candles, 400.0, tolerance=20.0)
+            tolerance = 0.05 * expected_max_idle_duration_candles
+            self.assertAlmostEqualFloat(
+                implied_max_idle_duration_candles,
+                float(expected_max_idle_duration_candles),
+                tolerance=tolerance,
+            )
 
     # Owns invariant: components-pbrs-breakdown-fields-119
     def test_pbrs_breakdown_fields_finite_and_aligned(self):
@@ -777,6 +796,65 @@ class TestRewardComponents(RewardSpaceTestBase):
             msg="invariance_correction should be ~0 in canonical mode",
         )
 
+    def test_rr_alias_matches_risk_reward_ratio(self):
+        """`rr` param alias matches `risk_reward_ratio` runtime naming."""
+        context = self.make_ctx(
+            pnl=0.02,
+            trade_duration=40,
+            idle_duration=0,
+            max_unrealized_profit=0.03,
+            min_unrealized_profit=0.01,
+            position=Positions.Long,
+            action=Actions.Long_exit,
+        )
+        rr_value = 1.75
+
+        # Canonical spelling
+        params_ratio = self.base_params(
+            exit_potential_mode="canonical",
+            risk_reward_ratio=rr_value,
+        )
+        params_ratio.pop("rr", None)
+
+        # Runtime spelling
+        params_rr = self.base_params(
+            exit_potential_mode="canonical",
+            rr=rr_value,
+        )
+        params_rr.pop("risk_reward_ratio", None)
+
+        br_ratio = calculate_reward(
+            context,
+            params_ratio,
+            base_factor=PARAMS.BASE_FACTOR,
+            profit_aim=PARAMS.PROFIT_AIM,
+            risk_reward_ratio=1.0,
+            short_allowed=True,
+            action_masking=True,
+        )
+        br_rr = calculate_reward(
+            context,
+            params_rr,
+            base_factor=PARAMS.BASE_FACTOR,
+            profit_aim=PARAMS.PROFIT_AIM,
+            risk_reward_ratio=1.0,
+            short_allowed=True,
+            action_masking=True,
+        )
+
+        self.assertAlmostEqualFloat(
+            br_rr.total,
+            br_ratio.total,
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            msg="Total reward should match when using rr alias",
+        )
+        self.assertAlmostEqualFloat(
+            br_rr.exit_component,
+            br_ratio.exit_component,
+            tolerance=TOLERANCE.IDENTITY_STRICT,
+            msg="Exit component should match when using rr alias",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
index 1a356b34cafc2f3e00ea13fbf22c776b338e8cd5..b44d7dd5d63a37114ee0e53aceac3e6c828375fb 100644 (file)
@@ -106,12 +106,12 @@ class PBRSConfig:
 
     Attributes:
         TERMINAL_TOL: Terminal potential must be within this tolerance of zero (1e-09)
-        MAX_ABS_SHAPING: Maximum absolute shaping value for bounded checks (10.0)
+        MAX_ABS_SHAPING: Maximum absolute shaping value for bounded checks (50.0)
         TERMINAL_PROBABILITY: Default probability of terminal state in sweeps (0.08)
     """
 
     TERMINAL_TOL: float = 1e-09
-    MAX_ABS_SHAPING: float = 10.0
+    MAX_ABS_SHAPING: float = 50.0
     TERMINAL_PROBABILITY: float = 0.08
 
 
@@ -238,7 +238,7 @@ class TestParameters:
         TRADE_DURATION_LONG: Long trade duration in steps (200)
 
         # Common additive parameters
-        ADDITIVE_SCALE_DEFAULT: Default additive scale factor (0.4)
+        ADDITIVE_RATIO_DEFAULT: Default additive ratio (0.4)
         ADDITIVE_GAIN_DEFAULT: Default additive gain (1.0)
     """
 
@@ -260,7 +260,7 @@ class TestParameters:
     TRADE_DURATION_LONG: int = 200
 
     # Additive parameters
-    ADDITIVE_SCALE_DEFAULT: float = 0.4
+    ADDITIVE_RATIO_DEFAULT: float = 0.4
     ADDITIVE_GAIN_DEFAULT: float = 1.0
 
 
index 7992ef0aa786336ad77b247a21f38b351e65bfce..e991c7239b711e362213980a14ed61babd64f6ee 100644 (file)
@@ -79,6 +79,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_dur,
             PARAMS.RISK_REWARD_RATIO,
             params,
+            PARAMS.BASE_FACTOR,
         )
         (
             _total_reward,
@@ -94,6 +95,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_duration_ratio=current_dur,
             next_pnl=0.0,
             next_duration_ratio=0.0,
+            base_factor=PARAMS.BASE_FACTOR,
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             is_exit=True,
             is_entry=False,
@@ -126,7 +128,9 @@ class TestPBRS(RewardSpaceTestBase):
             current_dur,
             PARAMS.RISK_REWARD_RATIO,
             params,
+            PARAMS.BASE_FACTOR,
         )
+
         gamma = _get_float_param(
             params, "potential_gamma", DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95)
         )
@@ -147,6 +151,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_duration_ratio=current_dur,
             next_pnl=0.0,
             next_duration_ratio=0.0,
+            base_factor=PARAMS.BASE_FACTOR,
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             is_exit=True,
             is_entry=False,
@@ -226,14 +231,22 @@ class TestPBRS(RewardSpaceTestBase):
 
     def test_additive_components_disabled_return_zero(self):
         """Verifies entry/exit additives return zero when disabled."""
-        params_entry = {"entry_additive_enabled": False, "entry_additive_scale": 1.0}
+        params_entry = {"entry_additive_enabled": False, "entry_additive_ratio": 1.0}
         val_entry = _compute_entry_additive(
-            0.5, PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO, 0.3, params_entry
+            0.5,
+            PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
+            0.3,
+            params_entry,
+            PARAMS.BASE_FACTOR,
         )
         self.assertEqual(float(val_entry), 0.0)
-        params_exit = {"exit_additive_enabled": False, "exit_additive_scale": 1.0}
+        params_exit = {"exit_additive_enabled": False, "exit_additive_ratio": 1.0}
         val_exit = _compute_exit_additive(
-            0.5, PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO, 0.3, params_exit
+            0.5,
+            PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
+            0.3,
+            params_exit,
+            PARAMS.BASE_FACTOR,
         )
         self.assertEqual(float(val_exit), 0.0)
 
@@ -260,6 +273,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_duration_ratio=0.0,
             next_pnl=0.01,
             next_duration_ratio=0.0,
+            base_factor=PARAMS.BASE_FACTOR,
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             is_exit=False,
             is_entry=True,
@@ -301,6 +315,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_duration_ratio=0.4,
             next_pnl=0.02,
             next_duration_ratio=0.41,
+            base_factor=PARAMS.BASE_FACTOR,
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             is_exit=False,
             is_entry=False,
@@ -386,6 +401,7 @@ class TestPBRS(RewardSpaceTestBase):
                 next_pnl=next_pnl,
                 next_duration_ratio=next_duration_ratio,
                 risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+                base_factor=PARAMS.BASE_FACTOR,
                 is_exit=True,
                 is_entry=False,
                 prev_potential=0.789,
@@ -407,8 +423,8 @@ class TestPBRS(RewardSpaceTestBase):
             hold_potential_enabled=True,
             entry_additive_enabled=True,
             exit_additive_enabled=True,
-            entry_additive_scale=10.0,
-            exit_additive_scale=10.0,
+            entry_additive_ratio=10.0,
+            exit_additive_ratio=10.0,
         )
 
         (
@@ -425,6 +441,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_duration_ratio=0.0,
             next_pnl=0.02,
             next_duration_ratio=0.0,
+            base_factor=PARAMS.BASE_FACTOR,
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             is_exit=False,
             is_entry=True,
@@ -443,6 +460,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_dur,
             PARAMS.RISK_REWARD_RATIO,
             params,
+            PARAMS.BASE_FACTOR,
         )
 
         (
@@ -459,6 +477,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_duration_ratio=current_dur,
             next_pnl=0.0,
             next_duration_ratio=0.0,
+            base_factor=PARAMS.BASE_FACTOR,
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             is_exit=True,
             is_entry=False,
@@ -502,6 +521,7 @@ class TestPBRS(RewardSpaceTestBase):
                 next_pnl=0.0,
                 next_duration_ratio=0.0,
                 risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+                base_factor=PARAMS.BASE_FACTOR,
                 is_exit=True,
                 prev_potential=prev_potential,
                 params=params,
@@ -535,6 +555,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_duration_ratio=0.2,
             next_pnl=0.035,
             next_duration_ratio=0.25,
+            base_factor=PARAMS.BASE_FACTOR,
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             is_exit=False,
             prev_potential=0.0,
@@ -548,6 +569,7 @@ class TestPBRS(RewardSpaceTestBase):
             current_duration_ratio=0.2,
             next_pnl=0.035,
             next_duration_ratio=0.25,
+            base_factor=PARAMS.BASE_FACTOR,
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             is_exit=False,
             prev_potential=0.0,
@@ -723,6 +745,7 @@ class TestPBRS(RewardSpaceTestBase):
             entry_additive_enabled=False,
             exit_additive_enabled=False,
             potential_gamma=0.9,
+            base_factor=PARAMS.BASE_FACTOR,
         )
 
         trade_duration = 5
@@ -752,6 +775,7 @@ class TestPBRS(RewardSpaceTestBase):
             duration_ratio=(trade_duration / max_trade_duration_candles),
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             params=params,
+            base_factor=PARAMS.BASE_FACTOR,
         )
         self.assertAlmostEqualFloat(
             breakdown.next_potential,
@@ -766,13 +790,13 @@ class TestPBRS(RewardSpaceTestBase):
         """Batch validate strict failures + relaxed multi-reason aggregation via helpers."""
         strict_failures = [
             build_validation_case({"potential_gamma": -0.2}, strict=True, expect_error=True),
-            build_validation_case({"hold_potential_scale": -5.0}, strict=True, expect_error=True),
+            build_validation_case({"hold_potential_ratio": -5.0}, strict=True, expect_error=True),
         ]
         success_case = build_validation_case({}, strict=True, expect_error=False)
         relaxed_case = build_validation_case(
             {
                 "potential_gamma": "not-a-number",
-                "hold_potential_scale": "-5.0",
+                "hold_potential_ratio": "-5.0",
                 "max_idle_duration_candles": "nan",
             },
             strict=False,
@@ -793,7 +817,7 @@ class TestPBRS(RewardSpaceTestBase):
         params_relaxed.update(
             {
                 "potential_gamma": "not-a-number",
-                "hold_potential_scale": "-5.0",
+                "hold_potential_ratio": "-5.0",
                 "max_idle_duration_candles": "nan",
             }
         )
@@ -803,7 +827,7 @@ class TestPBRS(RewardSpaceTestBase):
             params_relaxed,
             {
                 "potential_gamma": ["non_numeric_reset"],
-                "hold_potential_scale": ["numeric_coerce", "min="],
+                "hold_potential_ratio": ["numeric_coerce", "min="],
                 "max_idle_duration_candles": ["derived_default"],
             },
         )
@@ -818,7 +842,7 @@ class TestPBRS(RewardSpaceTestBase):
             potential_gamma=gamma,
             entry_additive_enabled=False,
             exit_additive_enabled=False,
-            hold_potential_scale=1.0,
+            hold_potential_ratio=1.0,
         )
         ctx_pnl = 0.012
         ctx_dur_ratio = 0.3
@@ -829,6 +853,7 @@ class TestPBRS(RewardSpaceTestBase):
             ctx_dur_ratio,
             PARAMS.RISK_REWARD_RATIO,
             params_can,
+            PARAMS.BASE_FACTOR,
         )
         self.assertFinite(prev_phi, name="prev_phi")
         next_phi_can = _compute_exit_potential(prev_phi, params_can)
@@ -892,6 +917,9 @@ class TestPBRS(RewardSpaceTestBase):
             potential_gamma=0.94,
         )
         prev_potential = 0.42
+        current_pnl = 0.02
+        current_dur = 0.5
+        profit_aim = PARAMS.PROFIT_AIM
         (
             _total_reward,
             reward_shaping,
@@ -901,11 +929,12 @@ class TestPBRS(RewardSpaceTestBase):
             _exit_additive,
         ) = apply_potential_shaping(
             base_reward=0.0,
-            current_pnl=0.012,
-            pnl_target=PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO,
-            current_duration_ratio=0.3,
+            current_pnl=current_pnl,
+            pnl_target=profit_aim * PARAMS.RISK_REWARD_RATIO,
+            current_duration_ratio=current_dur,
             next_pnl=0.0,
             next_duration_ratio=0.0,
+            base_factor=PARAMS.BASE_FACTOR,
             risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
             is_exit=True,
             is_entry=False,
@@ -936,6 +965,7 @@ class TestPBRS(RewardSpaceTestBase):
             entry_additive_enabled=False,
             exit_additive_enabled=False,
             potential_gamma=0.9,
+            base_factor=PARAMS.BASE_FACTOR,
         )
         pnl_target = PARAMS.PROFIT_AIM * PARAMS.RISK_REWARD_RATIO
         ctx = self.make_ctx(
@@ -955,7 +985,9 @@ class TestPBRS(RewardSpaceTestBase):
             current_duration_ratio,
             PARAMS.RISK_REWARD_RATIO,
             params,
+            PARAMS.BASE_FACTOR,
         )
+
         self.assertNotEqual(prev_potential, 0.0)
 
         breakdown = calculate_reward(
@@ -1098,6 +1130,7 @@ class TestPBRS(RewardSpaceTestBase):
                     next_pnl=0.025,
                     next_duration_ratio=0.35,
                     risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+                    base_factor=PARAMS.BASE_FACTOR,
                     is_exit=False,
                     prev_potential=0.0,
                     params=params,
@@ -1139,6 +1172,7 @@ class TestPBRS(RewardSpaceTestBase):
                     next_pnl=next_pnl,
                     next_duration_ratio=next_dur,
                     risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+                    base_factor=PARAMS.BASE_FACTOR,
                     is_exit=is_exit,
                     prev_potential=prev_potential,
                     params=params,
@@ -1192,6 +1226,7 @@ class TestPBRS(RewardSpaceTestBase):
                     next_pnl=next_pnl,
                     next_duration_ratio=next_dur,
                     risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+                    base_factor=PARAMS.BASE_FACTOR,
                     is_exit=is_exit,
                     prev_potential=prev_potential,
                     params=params,
@@ -1202,7 +1237,7 @@ class TestPBRS(RewardSpaceTestBase):
         self.assertGreater(
             abs(shaping_sum),
             PBRS_INVARIANCE_TOL * 50,
-            f"Expected non-zero Σ shaping (got {shaping_sum})",
+            f"Expected non-zero shaping (got {shaping_sum})",
         )
 
     # Non-owning smoke; ownership: robustness/test_robustness.py:35 (robustness-decomposition-integrity-101)
index abd7cec358c122750dd4fcbb38a65228ed931a4d..0867b0873fce874cc7c4fdea79d8a0c492f155cb 100644 (file)
@@ -57,7 +57,7 @@ def make_ctx(
 PBRS_INTEGRATION_PARAMS = [
     "potential_gamma",
     "hold_potential_enabled",
-    "hold_potential_scale",
+    "hold_potential_ratio",
     "entry_additive_enabled",
     "exit_additive_enabled",
 ]
@@ -144,10 +144,11 @@ class RewardSpaceTestBase(unittest.TestCase):
                     current_duration_ratio=current_dur,
                     next_pnl=next_pnl,
                     next_duration_ratio=next_dur,
+                    base_factor=PARAMS.BASE_FACTOR,
                     risk_reward_ratio=PARAMS.RISK_REWARD_RATIO,
+                    prev_potential=prev_potential,
                     is_exit=is_exit,
                     is_entry=False,
-                    prev_potential=prev_potential,
                     params=params,
                 )
             )
index 9c8ca33cc468dc9cf751ab194515df11f1756953..0b853607dd4acc818160aac3be9ff3acf44d245b 100644 (file)
@@ -158,13 +158,13 @@ class ReforceXY(BaseReinforcementLearningModel):
 
     DEFAULT_EXIT_POTENTIAL_DECAY: Final[float] = 0.5
     DEFAULT_ENTRY_ADDITIVE_ENABLED: Final[bool] = False
-    DEFAULT_ENTRY_ADDITIVE_SCALE: Final[float] = 1.0
+    DEFAULT_ENTRY_ADDITIVE_RATIO: Final[float] = 0.125
     DEFAULT_ENTRY_ADDITIVE_GAIN: Final[float] = 1.0
     DEFAULT_HOLD_POTENTIAL_ENABLED: Final[bool] = True
-    DEFAULT_HOLD_POTENTIAL_SCALE: Final[float] = 1.0
+    DEFAULT_HOLD_POTENTIAL_RATIO: Final[float] = 0.25
     DEFAULT_HOLD_POTENTIAL_GAIN: Final[float] = 1.0
     DEFAULT_EXIT_ADDITIVE_ENABLED: Final[bool] = False
-    DEFAULT_EXIT_ADDITIVE_SCALE: Final[float] = 1.0
+    DEFAULT_EXIT_ADDITIVE_RATIO: Final[float] = 0.125
     DEFAULT_EXIT_ADDITIVE_GAIN: Final[float] = 1.0
 
     DEFAULT_EXIT_PLATEAU: Final[bool] = True
@@ -177,9 +177,9 @@ class ReforceXY(BaseReinforcementLearningModel):
     DEFAULT_EFFICIENCY_CENTER: Final[float] = 0.5
 
     DEFAULT_INVALID_ACTION: Final[float] = -2.0
-    DEFAULT_IDLE_PENALTY_SCALE: Final[float] = 0.5
+    DEFAULT_IDLE_PENALTY_SCALE: Final[float] = 1.0
     DEFAULT_IDLE_PENALTY_POWER: Final[float] = 1.025
-    DEFAULT_HOLD_PENALTY_SCALE: Final[float] = 0.25
+    DEFAULT_HOLD_PENALTY_SCALE: Final[float] = 1.0
     DEFAULT_HOLD_PENALTY_POWER: Final[float] = 1.025
 
     DEFAULT_CHECK_INVARIANTS: Final[bool] = True
@@ -1775,9 +1775,9 @@ class MyRLEnv(Base5ActionRLEnv):
                 "entry_additive_enabled", ReforceXY.DEFAULT_ENTRY_ADDITIVE_ENABLED
             )
         )
-        self._entry_additive_scale: float = float(
+        self._entry_additive_ratio: float = float(
             model_reward_parameters.get(
-                "entry_additive_scale", ReforceXY.DEFAULT_ENTRY_ADDITIVE_SCALE
+                "entry_additive_ratio", ReforceXY.DEFAULT_ENTRY_ADDITIVE_RATIO
             )
         )
         self._entry_additive_gain: float = float(
@@ -1803,9 +1803,9 @@ class MyRLEnv(Base5ActionRLEnv):
                 "hold_potential_enabled", ReforceXY.DEFAULT_HOLD_POTENTIAL_ENABLED
             )
         )
-        self._hold_potential_scale: float = float(
+        self._hold_potential_ratio: float = float(
             model_reward_parameters.get(
-                "hold_potential_scale", ReforceXY.DEFAULT_HOLD_POTENTIAL_SCALE
+                "hold_potential_ratio", ReforceXY.DEFAULT_HOLD_POTENTIAL_RATIO
             )
         )
         self._hold_potential_gain: float = float(
@@ -1831,9 +1831,9 @@ class MyRLEnv(Base5ActionRLEnv):
                 "exit_additive_enabled", ReforceXY.DEFAULT_EXIT_ADDITIVE_ENABLED
             )
         )
-        self._exit_additive_scale: float = float(
+        self._exit_additive_ratio: float = float(
             model_reward_parameters.get(
-                "exit_additive_scale", ReforceXY.DEFAULT_EXIT_ADDITIVE_SCALE
+                "exit_additive_ratio", ReforceXY.DEFAULT_EXIT_ADDITIVE_RATIO
             )
         )
         self._exit_additive_gain: float = float(
@@ -2015,6 +2015,7 @@ class MyRLEnv(Base5ActionRLEnv):
         duration_ratio: float,
         pnl: float,
         pnl_target: float,
+        scale: float,
     ) -> float:
         """Compute PBRS potential Φ(s) for position holding states.
 
@@ -2027,7 +2028,7 @@ class MyRLEnv(Base5ActionRLEnv):
             pnl=pnl,
             pnl_target=pnl_target,
             duration_ratio=duration_ratio,
-            scale=self._hold_potential_scale,
+            scale=scale,
             gain=self._hold_potential_gain,
             transform_pnl=self._hold_potential_transform_pnl,
             transform_duration=self._hold_potential_transform_duration,
@@ -2039,6 +2040,7 @@ class MyRLEnv(Base5ActionRLEnv):
         pnl: float,
         pnl_target: float,
         duration_ratio: float,
+        scale: float,
     ) -> float:
         """Compute exit additive reward for position exit transitions.
 
@@ -2051,7 +2053,7 @@ class MyRLEnv(Base5ActionRLEnv):
             pnl=pnl,
             pnl_target=pnl_target,
             duration_ratio=duration_ratio,
-            scale=self._exit_additive_scale,
+            scale=scale,
             gain=self._exit_additive_gain,
             transform_pnl=self._exit_additive_transform_pnl,
             transform_duration=self._exit_additive_transform_duration,
@@ -2062,6 +2064,7 @@ class MyRLEnv(Base5ActionRLEnv):
         pnl: float,
         pnl_target: float,
         duration_ratio: float,
+        scale: float,
     ) -> float:
         """Compute entry additive reward for position entry transitions.
 
@@ -2074,7 +2077,7 @@ class MyRLEnv(Base5ActionRLEnv):
             pnl=pnl,
             pnl_target=pnl_target,
             duration_ratio=duration_ratio,
-            scale=self._entry_additive_scale,
+            scale=scale,
             gain=self._entry_additive_gain,
             transform_pnl=self._entry_additive_transform_pnl,
             transform_duration=self._entry_additive_transform_duration,
@@ -2208,6 +2211,9 @@ class MyRLEnv(Base5ActionRLEnv):
         max_trade_duration: float,
         pnl: float,
         pnl_target: float,
+        hold_potential_scale: float,
+        entry_additive_scale: float,
+        exit_additive_scale: float,
     ) -> tuple[float, float, float]:
         """Compute potential-based reward shaping (PBRS) components.
 
@@ -2240,6 +2246,7 @@ class MyRLEnv(Base5ActionRLEnv):
         **State Variables:**
             r_pnl         : pnl / pnl_target (PnL ratio)
             r_dur         : duration / max_duration (duration ratio, clamp [0,1])
+            scale         : scale parameter
             g             : gain parameter
             T_x           : transform function (tanh, softsign, etc.)
 
@@ -2347,6 +2354,12 @@ class MyRLEnv(Base5ActionRLEnv):
             Current position PnL (for current state s)
         pnl_target : float
             Target PnL for ratio normalization: r_pnl = pnl / pnl_target
+        hold_potential_scale : float
+            Magnitude scale for hold potential (= hold_potential_ratio * base_factor)
+        entry_additive_scale : float
+            Magnitude scale for entry additive (= entry_additive_ratio * base_factor)
+        exit_additive_scale : float
+            Magnitude scale for exit additive (= exit_additive_ratio * base_factor)
 
         Returns
         -------
@@ -2418,7 +2431,11 @@ class MyRLEnv(Base5ActionRLEnv):
         if is_entry or is_hold:
             if self._hold_potential_enabled:
                 next_potential = self._compute_hold_potential(
-                    next_position, next_duration_ratio, next_pnl, pnl_target
+                    next_position,
+                    next_duration_ratio,
+                    next_pnl,
+                    pnl_target,
+                    hold_potential_scale,
                 )
                 reward_shaping = gamma * next_potential - prev_potential
             else:
@@ -2431,9 +2448,10 @@ class MyRLEnv(Base5ActionRLEnv):
                 and not self.is_pbrs_invariant_mode()
             ):
                 entry_additive = self._compute_entry_additive(
-                    pnl=next_pnl,
-                    pnl_target=pnl_target,
-                    duration_ratio=next_duration_ratio,
+                    next_pnl,
+                    pnl_target,
+                    next_duration_ratio,
+                    entry_additive_scale,
                 )
                 self._total_entry_additive += float(entry_additive)
 
@@ -2454,7 +2472,7 @@ class MyRLEnv(Base5ActionRLEnv):
             if self._exit_additive_enabled and not self.is_pbrs_invariant_mode():
                 duration_ratio = trade_duration / max(max_trade_duration, 1)
                 exit_additive = self._compute_exit_additive(
-                    pnl, pnl_target, duration_ratio
+                    pnl, pnl_target, duration_ratio, exit_additive_scale
                 )
                 self._total_exit_additive += float(exit_additive)
 
@@ -2646,7 +2664,7 @@ class MyRLEnv(Base5ActionRLEnv):
         model_reward_parameters: Mapping[str, Any],
     ) -> float:
         """
-        Compute exit factor: base_factor × time_attenuation_coefficient x pnl_target_coefficient x efficiency_coefficient.
+        Compute exit factor: base_factor × time_attenuation_coefficient × pnl_target_coefficient × efficiency_coefficient.
         """
         if not (
             np.isfinite(base_factor)
@@ -2833,7 +2851,7 @@ class MyRLEnv(Base5ActionRLEnv):
         base_factor = float(
             model_reward_parameters.get("base_factor", ReforceXY.DEFAULT_BASE_FACTOR)
         )
-        idle_factor = base_factor * (self.profit_aim / self.rr) / 4.0
+        idle_factor = base_factor * (self.profit_aim / self.rr)
         hold_factor = idle_factor
 
         # 2. Idle penalty
@@ -2889,7 +2907,7 @@ class MyRLEnv(Base5ActionRLEnv):
                 self._last_hold_penalty = float(base_reward)
 
         # 4. Exit rewards
-        pnl = self.get_unrealized_profit()
+        pnl: float = self.get_unrealized_profit()
         if (
             base_reward is None
             and action == Actions.Long_exit.value
@@ -2914,12 +2932,19 @@ class MyRLEnv(Base5ActionRLEnv):
             base_reward = 0.0
 
         # 6. Potential-based reward shaping
+        hold_potential_scale = self._hold_potential_ratio * base_factor
+        entry_additive_scale = self._entry_additive_ratio * base_factor
+        exit_additive_scale = self._exit_additive_ratio * base_factor
+
         reward_shaping, entry_additive, exit_additive = self._compute_pbrs_components(
             action=action,
             trade_duration=trade_duration,
             max_trade_duration=max_trade_duration,
             pnl=pnl,
             pnl_target=self._pnl_target,
+            hold_potential_scale=hold_potential_scale,
+            entry_additive_scale=entry_additive_scale,
+            exit_additive_scale=exit_additive_scale,
         )
 
         return base_reward + reward_shaping + entry_additive + exit_additive