refactor(ReforceXY): normalize tunables namespace
author Jérôme Benoit <jerome.benoit@piment-noir.org>
Sun, 28 Dec 2025 15:09:34 +0000 (16:09 +0100)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Sun, 28 Dec 2025 15:09:34 +0000 (16:09 +0100)
Signed-off-by: Jérôme Benoit <jerome.benoit@piment-noir.org>
ReforceXY/reward_space_analysis/README.md
ReforceXY/reward_space_analysis/reward_space_analysis.py
ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
ReforceXY/reward_space_analysis/tests/helpers/test_internal_branches.py
ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
ReforceXY/user_data/freqaimodels/ReforceXY.py

diff --git a/ReforceXY/reward_space_analysis/README.md b/ReforceXY/reward_space_analysis/README.md
index 95c53eefdf07f0fa3b326b593f1c1d5200e509f4..65450f4b43fb8527f136b180393ff28957721ab5 100644
@@ -52,7 +52,7 @@ Full test documentation: [tests/README.md](./tests/README.md).
   - [Reward & Shaping](#reward--shaping)
   - [Diagnostics & Validation](#diagnostics--validation)
   - [Overrides](#overrides)
-  - [Reward Parameter Cheat Sheet](#reward-parameter-cheat-sheet)
+  - [Reward Tunables Reference](#reward-tunables-reference)
   - [Exit Attenuation Kernels](#exit-attenuation-kernels)
   - [Transform Functions](#transform-functions)
   - [Skipping Feature Analysis](#skipping-feature-analysis)
@@ -220,7 +220,7 @@ be overridden via `--params`.
   scalars (`profit_aim`, `risk_reward_ratio`, `action_masking`). Conflicts:
   individual flags vs `--params` ⇒ `--params` wins.
 
-### Reward Parameter Cheat Sheet
+### Reward Tunables Reference
 
 #### Core
 
@@ -237,12 +237,12 @@ The exit factor is computed as:
 
 ##### PnL Target
 
-| Parameter           | Default | Description                   |
-| ------------------- | ------- | ----------------------------- |
-| `profit_aim`        | 0.03    | Profit target threshold       |
-| `risk_reward_ratio` | 2.0     | Risk/reward multiplier        |
-| `win_reward_factor` | 2.0     | Profit target bonus factor    |
-| `pnl_factor_beta`   | 0.5     | PnL amplification sensitivity |
+| Parameter                       | Default | Description                   |
+| ------------------------------- | ------- | ----------------------------- |
+| `profit_aim`                    | 0.03    | Profit target threshold       |
+| `risk_reward_ratio`             | 2.0     | Risk/reward multiplier        |
+| `win_reward_factor`             | 2.0     | Profit target bonus factor    |
+| `pnl_amplification_sensitivity` | 0.5     | PnL amplification sensitivity |
 
 **Note:** In ReforceXY, `risk_reward_ratio` maps to `rr`.
 
@@ -252,9 +252,9 @@ Let `pnl_target = profit_aim × risk_reward_ratio`, `pnl_ratio = pnl / pnl_targe
 
 - If `pnl_target ≤ 0`: `pnl_target_coefficient = 1.0`
 - If `pnl_ratio > 1.0`:
-  `pnl_target_coefficient = 1.0 + win_reward_factor × tanh(pnl_factor_beta × (pnl_ratio − 1.0))`
+  `pnl_target_coefficient = 1.0 + win_reward_factor × tanh(pnl_amplification_sensitivity × (pnl_ratio − 1.0))`
 - If `pnl_ratio < −(1.0 / risk_reward_ratio)`:
-  `pnl_target_coefficient = 1.0 + (win_reward_factor × risk_reward_ratio) × tanh(pnl_factor_beta × (|pnl_ratio| − 1.0))`
+  `pnl_target_coefficient = 1.0 + (win_reward_factor × risk_reward_ratio) × tanh(pnl_amplification_sensitivity × (|pnl_ratio| − 1.0))`
 - Else: `pnl_target_coefficient = 1.0`
 
 ##### Efficiency
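
As a concrete reading of the piecewise definition above, here is a standalone Python sketch of the coefficient, mirroring the branch structure implemented in reward_space_analysis.py below (defaults taken from the PnL Target table; the function name is illustrative, not the repository helper):

```python
import math

def pnl_target_coefficient(
    pnl: float,
    profit_aim: float = 0.03,
    risk_reward_ratio: float = 2.0,
    win_reward_factor: float = 2.0,
    pnl_amplification_sensitivity: float = 0.5,
) -> float:
    pnl_target = profit_aim * risk_reward_ratio
    if pnl_target <= 0.0:
        return 1.0
    pnl_ratio = pnl / pnl_target
    if abs(pnl_ratio) > 1.0:
        base = math.tanh(pnl_amplification_sensitivity * (abs(pnl_ratio) - 1.0))
        if pnl_ratio > 1.0:
            # Overshoot: bonus scaled by win_reward_factor
            return 1.0 + win_reward_factor * base
        if pnl_ratio < -(1.0 / risk_reward_ratio):
            # Loss beyond target: penalty amplified by risk_reward_ratio
            return 1.0 + (win_reward_factor * risk_reward_ratio) * base
    return 1.0

# Defaults give pnl_target = 0.06; pnl = 0.09 -> pnl_ratio = 1.5, so
# coefficient = 1 + 2 * tanh(0.5 * 0.5) ~= 1.49
print(pnl_target_coefficient(0.09))
```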
@@ -465,7 +465,7 @@ uv run python reward_space_analysis.py --params win_reward_factor=3.0 idle_penal
 `risk_reward_ratio`, `action_masking`.
 
 **Reward tunables** (tunable via either direct flag or `--params`) correspond to
-those listed under Reward Parameter Cheat Sheet: Core, Duration Penalties, Exit
+those listed under Reward Tunables Reference: Core, Duration Penalties, Exit
 Attenuation, Efficiency, Validation, PBRS, Hold/Entry/Exit Potential Transforms.
 
 ## Examples
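
With the rename applied, `--params` overrides take the new key, e.g. `uv run python reward_space_analysis.py --params pnl_amplification_sensitivity=0.75` (value illustrative).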
diff --git a/ReforceXY/reward_space_analysis/reward_space_analysis.py b/ReforceXY/reward_space_analysis/reward_space_analysis.py
index cbac069806ddb65ae61c5173ec8a9edd617a3dc4..1db151c596db80b0692662d289ba0b4375114ee9 100644
@@ -151,7 +151,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS: RewardParams = {
     "efficiency_center": 0.5,
     # Profit factor defaults
     "win_reward_factor": 2.0,
-    "pnl_factor_beta": 0.5,
+    "pnl_amplification_sensitivity": 0.5,
     # Invariant / safety defaults
     "check_invariants": True,
     "exit_factor_threshold": 1000.0,
@@ -202,7 +202,7 @@ DEFAULT_MODEL_REWARD_PARAMETERS_HELP: dict[str, str] = {
     "efficiency_weight": "Efficiency weight",
     "efficiency_center": "Efficiency pivot in [0,1]",
     "win_reward_factor": "Profit overshoot bonus factor",
-    "pnl_factor_beta": "PnL amplification sensitivity",
+    "pnl_amplification_sensitivity": "PnL amplification sensitivity",
     "check_invariants": "Enable runtime invariant checks",
     "exit_factor_threshold": "Warn if |exit_factor| exceeds",
     # PBRS parameters
@@ -250,7 +250,7 @@ _PARAMETER_BOUNDS: dict[str, dict[str, float]] = {
     "efficiency_weight": {"min": 0.0, "max": 2.0},
     "efficiency_center": {"min": 0.0, "max": 1.0},
     "win_reward_factor": {"min": 0.0},
-    "pnl_factor_beta": {"min": 1e-6},
+    "pnl_amplification_sensitivity": {"min": 1e-6},
     # PBRS parameter bounds
     "potential_gamma": {"min": 0.0, "max": 1.0},
     "exit_potential_decay": {"min": 0.0, "max": 1.0},
@@ -992,12 +992,14 @@ def _compute_pnl_target_coefficient(
 
     if pnl_target > 0.0:
         win_reward_factor = _get_float_param(params, "win_reward_factor")
-        pnl_factor_beta = _get_float_param(params, "pnl_factor_beta")
+        pnl_amplification_sensitivity = _get_float_param(params, "pnl_amplification_sensitivity")
         rr = risk_reward_ratio if risk_reward_ratio > 0 else RISK_REWARD_RATIO_DEFAULT
 
         pnl_ratio = pnl / pnl_target
         if abs(pnl_ratio) > 1.0:
-            base_pnl_target_coefficient = math.tanh(pnl_factor_beta * (abs(pnl_ratio) - 1.0))
+            base_pnl_target_coefficient = math.tanh(
+                pnl_amplification_sensitivity * (abs(pnl_ratio) - 1.0)
+            )
             if pnl_ratio > 1.0:
                 pnl_target_coefficient = 1.0 + win_reward_factor * base_pnl_target_coefficient
             elif pnl_ratio < -(1.0 / rr):
@@ -1285,7 +1287,7 @@ def calculate_reward(
             center_unrealized = 0.5 * (
                 context.max_unrealized_profit + context.min_unrealized_profit
             )
-            beta = _get_float_param(params, "pnl_factor_beta")
+            beta = _get_float_param(params, "pnl_amplification_sensitivity")
             next_pnl = float(center_unrealized * math.tanh(beta * next_duration_ratio))
         else:
             next_pnl = current_pnl
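
For intuition on the hold-path projection above, a self-contained sketch of the next-PnL estimate (argument names are stand-ins for the environment's context fields and duration ratio):

```python
import math

def project_unrealized_pnl(
    max_unrealized_profit: float,
    min_unrealized_profit: float,
    next_duration_ratio: float,
    beta: float = 0.5,  # pnl_amplification_sensitivity
) -> float:
    # Midpoint of the observed unrealized-profit range
    center_unrealized = 0.5 * (max_unrealized_profit + min_unrealized_profit)
    # tanh saturates toward center_unrealized as the duration ratio grows
    return float(center_unrealized * math.tanh(beta * next_duration_ratio))

# Range [-0.01, 0.05] -> center 0.02; ratio 1.0 -> 0.02 * tanh(0.5) ~= 0.0092
print(project_unrealized_pnl(0.05, -0.01, 1.0))
```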
diff --git a/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py b/ReforceXY/reward_space_analysis/tests/components/test_reward_components.py
index f5410d6fad9b2b568a26656418e3b06f32deb542..da99aa18dd37b715fbe78456ef75c5bab6eb81dd 100644
@@ -220,13 +220,13 @@ class TestRewardComponents(RewardSpaceTestBase):
         **Setup:**
         - PnL: 150% of pnl_target (exceeds target by 50%)
         - pnl_target: 0.045 (profit_aim=0.03 * risk_reward_ratio=1.5)
-        - Parameters: win_reward_factor=2.0, pnl_factor_beta=0.5
+        - Parameters: win_reward_factor=2.0, pnl_amplification_sensitivity=0.5
 
         **Assertions:**
         - Coefficient is finite
         - Coefficient > 1.0 (rewards exceeding target)
         """
-        params = self.base_params(win_reward_factor=2.0, pnl_factor_beta=0.5)
+        params = self.base_params(win_reward_factor=2.0, pnl_amplification_sensitivity=0.5)
         profit_aim = 0.03
         risk_reward_ratio = 1.5
         pnl_target = profit_aim * risk_reward_ratio
@@ -252,13 +252,13 @@ class TestRewardComponents(RewardSpaceTestBase):
         - PnL: -0.06 (exceeds pnl_target magnitude)
         - pnl_target: 0.045 (profit_aim=0.03 * risk_reward_ratio=1.5)
         - Penalty threshold: pnl < -pnl_target = -0.045
-        - Parameters: win_reward_factor=2.0, pnl_factor_beta=0.5
+        - Parameters: win_reward_factor=2.0, pnl_amplification_sensitivity=0.5
 
         **Assertions:**
         - Coefficient is finite
         - Coefficient > 1.0 (amplifies loss penalty)
         """
-        params = self.base_params(win_reward_factor=2.0, pnl_factor_beta=0.5)
+        params = self.base_params(win_reward_factor=2.0, pnl_amplification_sensitivity=0.5)
         profit_aim = 0.03
         risk_reward_ratio = 1.5
         pnl_target = profit_aim * risk_reward_ratio  # 0.045
@@ -652,7 +652,7 @@ class TestRewardComponents(RewardSpaceTestBase):
         pnl_target = profit_aim * risk_reward_ratio
         params = self.base_params(
             win_reward_factor=win_reward_factor,
-            pnl_factor_beta=beta,
+            pnl_amplification_sensitivity=beta,
             efficiency_weight=0.0,
             exit_attenuation_mode="linear",
             exit_plateau=False,
diff --git a/ReforceXY/reward_space_analysis/tests/helpers/test_internal_branches.py b/ReforceXY/reward_space_analysis/tests/helpers/test_internal_branches.py
index 61419c8dfe4f2f7388b1ca8badfbfee4ef3a2a64..ce62d88fab08997031a6340c9e107aeb26da5ff1 100644
@@ -68,7 +68,7 @@ def test_calculate_reward_unrealized_pnl_hold_path():
     params = {
         "hold_potential_enabled": True,
         "unrealized_pnl": True,
-        "pnl_factor_beta": 0.5,
+        "pnl_amplification_sensitivity": 0.5,
     }
     breakdown = calculate_reward_with_defaults(
         context,
diff --git a/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py b/ReforceXY/reward_space_analysis/tests/pbrs/test_pbrs.py
index cbbc3e516f76cdb6b79bea2032936ca0acf6e583..fb56d6307a1709649901fbcf6adfafe013b89464 100644
@@ -132,7 +132,9 @@ class TestPBRS(RewardSpaceTestBase):
         )
 
         gamma = _get_float_param(
-            params, "potential_gamma", DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95)
+            params,
+            "potential_gamma",
+            DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95),
         )
         expected_next_potential = (
             prev_potential / gamma if gamma not in (0.0, None) else prev_potential
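
The `expected_next_potential` expression encodes the PBRS identity: with shaping term F = γ·Φ(s′) − Φ(s), choosing Φ(s′) = Φ(s)/γ makes F vanish. A minimal sketch of that check (helper name illustrative, not the test suite's):

```python
def shaping_term(prev_potential: float, next_potential: float, gamma: float) -> float:
    # Potential-based reward shaping: F = gamma * phi(s') - phi(s)
    return gamma * next_potential - prev_potential

prev_potential, gamma = 0.4, 0.9
expected_next_potential = prev_potential / gamma if gamma else prev_potential
assert abs(shaping_term(prev_potential, expected_next_potential, gamma)) < 1e-12
```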
@@ -872,7 +874,9 @@ class TestPBRS(RewardSpaceTestBase):
             potential_gamma=0.9,
         )
         gamma = _get_float_param(
-            params, "potential_gamma", DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95)
+            params,
+            "potential_gamma",
+            DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95),
         )
         rng = np.random.default_rng(555)
         potentials = rng.uniform(0.05, 0.85, size=220)
@@ -1126,7 +1130,9 @@ class TestPBRS(RewardSpaceTestBase):
             exit_potential_mode="canonical",
         )
         gamma = _get_float_param(
-            params, "potential_gamma", DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95)
+            params,
+            "potential_gamma",
+            DEFAULT_MODEL_REWARD_PARAMETERS.get("potential_gamma", 0.95),
         )
         rng = np.random.default_rng(321)
         prev_potential = 0.0
diff --git a/ReforceXY/user_data/freqaimodels/ReforceXY.py b/ReforceXY/user_data/freqaimodels/ReforceXY.py
index 6c8fa158baad8dee1f03a7d008ae9443394baa74..82459f989a54c12ca599f1f7ba42ca3a152ad584 100644
@@ -172,7 +172,7 @@ class ReforceXY(BaseReinforcementLearningModel):
     DEFAULT_EXIT_LINEAR_SLOPE: Final[float] = 1.0
     DEFAULT_EXIT_HALF_LIFE: Final[float] = 0.5
 
-    DEFAULT_PNL_FACTOR_BETA: Final[float] = 0.5
+    DEFAULT_PNL_AMPLIFICATION_SENSITIVITY: Final[float] = 0.5
     DEFAULT_WIN_REWARD_FACTOR: Final[float] = 2.0
     DEFAULT_EFFICIENCY_WEIGHT: Final[float] = 1.0
     DEFAULT_EFFICIENCY_CENTER: Final[float] = 0.5
@@ -2895,16 +2895,17 @@ class MyRLEnv(Base5ActionRLEnv):
         pnl_target_coefficient = 1.0
 
         if pnl_target > 0.0:
-            pnl_factor_beta = float(
+            pnl_amplification_sensitivity = float(
                 model_reward_parameters.get(
-                    "pnl_factor_beta", ReforceXY.DEFAULT_PNL_FACTOR_BETA
+                    "pnl_amplification_sensitivity",
+                    ReforceXY.DEFAULT_PNL_AMPLIFICATION_SENSITIVITY,
                 )
             )
             pnl_ratio = pnl / pnl_target
 
             if abs(pnl_ratio) > 1.0:
                 base_pnl_target_coefficient = math.tanh(
-                    pnl_factor_beta * (abs(pnl_ratio) - 1.0)
+                    pnl_amplification_sensitivity * (abs(pnl_ratio) - 1.0)
                 )
                 win_reward_factor = float(
                     model_reward_parameters.get(